| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
| ; RUN: llc < %s -mtriple=amdgcn | FileCheck %s -check-prefixes=GCN |
| ; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefixes=GFX7 |
| ; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s -check-prefixes=GFX8 |
| ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX9 |
| ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10 |
| ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11TRUE16 |
| ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11FAKE16 |
| |
| declare bfloat @llvm.copysign.bf16(bfloat, bfloat) |
| |
| ; Baseline case: llvm.copysign.bf16 where both the magnitude and the sign are |
| ; plain bfloat arguments (v0 and v1 in the checks below). |
| ; The GCN and GFX7 checks expect a mul/and/shift/bfe/or/shift sequence, while |
| ; the GFX8 and later checks expect a single v_bfi_b32 with a 0x7fff mask. |
| define bfloat @v_copysign_bf16_bf16(bfloat %mag, bfloat %sign) { |
| ; GCN-LABEL: v_copysign_bf16_bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GCN-NEXT: v_and_b32_e32 v1, 0x80000000, v1 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_bf16_bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GFX7-NEXT: v_and_b32_e32 v1, 0x80000000, v1 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_bf16_bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: s_movk_i32 s4, 0x7fff |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_bf16_bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_movk_i32 s4, 0x7fff |
| ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_bf16_bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-LABEL: v_copysign_bf16_bf16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 |
| ; GFX11-NEXT: s_setpc_b64 s[30:31] |
| %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) |
| ret bfloat %op |
| } |
| |
| ; Same copysign as the baseline, but the sign operand is marked inreg. |
| ; The checks read it from s16 (s0 in the GFX11 checks). The GFX8 and GFX9 |
| ; checks first copy it into v1, whereas the GFX10 and GFX11 checks feed the |
| ; scalar register straight into v_bfi_b32. |
| define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) { |
| ; GCN-LABEL: v_copysign_bf16_s_bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GCN-NEXT: s_and_b32 s4, s16, 0x80000000 |
| ; GCN-NEXT: s_lshr_b32 s4, s4, 16 |
| ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GCN-NEXT: v_or_b32_e32 v0, s4, v0 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_bf16_s_bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GFX7-NEXT: s_and_b32 s4, s16, 0x80000000 |
| ; GFX7-NEXT: s_lshr_b32 s4, s4, 16 |
| ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v0, s4, v0 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_bf16_s_bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: s_movk_i32 s4, 0x7fff |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s16 |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_bf16_s_bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_movk_i32 s4, 0x7fff |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s16 |
| ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_bf16_s_bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, s16 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-LABEL: v_copysign_bf16_s_bf16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, s0 |
| ; GFX11-NEXT: s_setpc_b64 s[30:31] |
| %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) |
| ret bfloat %op |
| } |
| |
| ; Mirror of the previous test: here the magnitude is the inreg operand and the |
| ; sign is a plain bfloat argument (v0 in the checks). The magnitude is read from |
| ; s16 (s0 in the GFX11 checks); the GCN and GFX7 checks use the VOP3 form |
| ; v_mul_f32_e64 so the scalar register can be the multiply source. |
| define bfloat @v_copysign_s_bf16_bf16(bfloat inreg %mag, bfloat %sign) { |
| ; GCN-LABEL: v_copysign_s_bf16_bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s16 |
| ; GCN-NEXT: v_and_b32_e32 v0, 0x80000000, v0 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GCN-NEXT: v_or_b32_e32 v0, v1, v0 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_s_bf16_bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s16 |
| ; GFX7-NEXT: v_and_b32_e32 v0, 0x80000000, v0 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_s_bf16_bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: s_movk_i32 s4, 0x7fff |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s16 |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v1, v0 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_s_bf16_bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_movk_i32 s4, 0x7fff |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s16 |
| ; GFX9-NEXT: v_bfi_b32 v0, s4, v1, v0 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_s_bf16_bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s16, v0 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-LABEL: v_copysign_s_bf16_bf16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 |
| ; GFX11-NEXT: s_setpc_b64 s[30:31] |
| %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) |
| ret bfloat %op |
| } |
| |
| ; bf16 magnitude with a sign that starts life as a float and is truncated to |
| ; bfloat in the IR. None of the expected sequences contains a conversion |
| ; instruction for the sign: the GCN and GFX7 checks mask bit 31 and shift it |
| ; down by 16, and the GFX8 and later checks shift v1 right by 16 before the |
| ; v_bfi_b32. |
| define bfloat @v_copysign_bf16_f32(bfloat %mag, float %sign.f32) { |
| ; GCN-LABEL: v_copysign_bf16_f32: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GCN-NEXT: v_and_b32_e32 v1, 0x80000000, v1 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_bf16_f32: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GFX7-NEXT: v_and_b32_e32 v1, 0x80000000, v1 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_bf16_f32: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX8-NEXT: s_movk_i32 s4, 0x7fff |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_bf16_f32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX9-NEXT: s_movk_i32 s4, 0x7fff |
| ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_bf16_f32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-LABEL: v_copysign_bf16_f32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 |
| ; GFX11-NEXT: s_setpc_b64 s[30:31] |
| ; Narrow the float sign source to bfloat so it matches the intrinsic's type. |
| %sign = fptrunc float %sign.f32 to bfloat |
| %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) |
| ret bfloat %op |
| } |
| |
| ; bf16 magnitude with a sign that is a double truncated to bfloat in the IR. |
| ; As in the f32 variant, the expected code has no conversion instruction for |
| ; the sign; every check sequence reads only v2 of the sign operand. |
| ; NOTE(review): v2 is presumably the high dword of the double - confirm. |
| define bfloat @v_copysign_bf16_f64(bfloat %mag, double %sign.f64) { |
| ; GCN-LABEL: v_copysign_bf16_f64: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GCN-NEXT: v_and_b32_e32 v1, 0x80000000, v2 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_bf16_f64: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GFX7-NEXT: v_and_b32_e32 v1, 0x80000000, v2 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_bf16_f64: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2 |
| ; GFX8-NEXT: s_movk_i32 s4, 0x7fff |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_bf16_f64: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v2 |
| ; GFX9-NEXT: s_movk_i32 s4, 0x7fff |
| ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_bf16_f64: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v2 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-LABEL: v_copysign_bf16_f64: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 |
| ; GFX11-NEXT: s_setpc_b64 s[30:31] |
| ; Narrow the double sign source to bfloat so it matches the intrinsic's type. |
| %sign = fptrunc double %sign.f64 to bfloat |
| %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) |
| ret bfloat %op |
| } |
| |
| ; bf16 magnitude with a sign that is a half reinterpreted (bitcast, not |
| ; converted) as bfloat. The GCN and GFX7 checks run v_cvt_f16_f32 on the sign |
| ; operand and then mask it with 0x8000; the GFX8 and later checks expect only a |
| ; v_bfi_b32 with the 0x7fff mask. |
| define bfloat @v_copysign_bf16_f16(bfloat %mag, half %sign.f16) { |
| ; GCN-LABEL: v_copysign_bf16_f16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 |
| ; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1 |
| ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_bf16_f16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 |
| ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_bf16_f16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: s_movk_i32 s4, 0x7fff |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_bf16_f16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_movk_i32 s4, 0x7fff |
| ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_bf16_f16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-LABEL: v_copysign_bf16_f16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 |
| ; GFX11-NEXT: s_setpc_b64 s[30:31] |
| ; Reinterpret the 16 bits of the half as a bfloat; no value conversion happens. |
| %sign = bitcast half %sign.f16 to bfloat |
| %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) |
| ret bfloat %op |
| } |
| |
| ; Scalar-input variant in an amdgpu_ps function: both bfloat operands are |
| ; inreg (s0 and s1 in the checks). The bf16 result is bitcast to i16, |
| ; zero-extended to i32 and returned through llvm.amdgcn.readfirstlane, so every |
| ; expected sequence ends with v_readfirstlane_b32 s0. The GFX8 and later checks |
| ; also contain a v_and_b32 with 0xffff after the v_bfi_b32. |
| define amdgpu_ps i32 @s_copysign_bf16_bf16(bfloat inreg %mag, bfloat inreg %sign) { |
| ; GCN-LABEL: s_copysign_bf16_bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 |
| ; GCN-NEXT: s_and_b32 s0, s1, 0x80000000 |
| ; GCN-NEXT: s_lshr_b32 s0, s0, 16 |
| ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GCN-NEXT: v_or_b32_e32 v0, s0, v0 |
| ; GCN-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX7-LABEL: s_copysign_bf16_bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0 |
| ; GFX7-NEXT: s_and_b32 s0, s1, 0x80000000 |
| ; GFX7-NEXT: s_lshr_b32 s0, s0, 16 |
| ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX7-NEXT: ; return to shader part epilog |
| ; |
| ; GFX8-LABEL: s_copysign_bf16_bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_movk_i32 s2, 0x7fff |
| ; GFX8-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX8-NEXT: v_bfi_b32 v0, s2, v0, v1 |
| ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 |
| ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX8-NEXT: ; return to shader part epilog |
| ; |
| ; GFX9-LABEL: s_copysign_bf16_bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_movk_i32 s2, 0x7fff |
| ; GFX9-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1 |
| ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 |
| ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: s_copysign_bf16_bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s1 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 |
| ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 |
| ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: s_copysign_bf16_bf16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX11-NEXT: ; return to shader part epilog |
| %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) |
| ; Widen the 16-bit result to i32 and return it via readfirstlane. |
| %cast = bitcast bfloat %op to i16 |
| %zext = zext i16 %cast to i32 |
| %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext) |
| ret i32 %readlane |
| } |
| |
| ; amdgpu_ps variant with an inreg bf16 magnitude (s0) and an inreg float sign |
| ; (s1) that is truncated to bfloat in the IR. The expected code contains no |
| ; conversion instruction for the sign: the GCN and GFX7 checks mask and shift |
| ; s1 with scalar instructions, and the GFX8 and later checks shift s1 right by |
| ; 16 with v_lshrrev_b32_e64 before the v_bfi_b32. The result is returned as a |
| ; zero-extended i32 through llvm.amdgcn.readfirstlane. |
| define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f32) { |
| ; GCN-LABEL: s_copysign_bf16_f32: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 |
| ; GCN-NEXT: s_and_b32 s0, s1, 0x80000000 |
| ; GCN-NEXT: s_lshr_b32 s0, s0, 16 |
| ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GCN-NEXT: v_or_b32_e32 v0, s0, v0 |
| ; GCN-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX7-LABEL: s_copysign_bf16_f32: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0 |
| ; GFX7-NEXT: s_and_b32 s0, s1, 0x80000000 |
| ; GFX7-NEXT: s_lshr_b32 s0, s0, 16 |
| ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX7-NEXT: ; return to shader part epilog |
| ; |
| ; GFX8-LABEL: s_copysign_bf16_f32: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: v_lshrrev_b32_e64 v0, 16, s1 |
| ; GFX8-NEXT: s_movk_i32 s1, 0x7fff |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX8-NEXT: v_bfi_b32 v0, s1, v1, v0 |
| ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 |
| ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX8-NEXT: ; return to shader part epilog |
| ; |
| ; GFX9-LABEL: s_copysign_bf16_f32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 16, s1 |
| ; GFX9-NEXT: s_movk_i32 s1, 0x7fff |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX9-NEXT: v_bfi_b32 v0, s1, v1, v0 |
| ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 |
| ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: s_copysign_bf16_f32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 16, s1 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 |
| ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 |
| ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: s_copysign_bf16_f32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_lshrrev_b32_e64 v0, 16, s1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; Narrow the float sign source to bfloat so it matches the intrinsic's type. |
| %sign = fptrunc float %sign.f32 to bfloat |
| %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) |
| ; Widen the 16-bit result to i32 and return it via readfirstlane. |
| %cast = bitcast bfloat %op to i16 |
| %zext = zext i16 %cast to i32 |
| %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext) |
| ret i32 %readlane |
| } |
| |
| ; amdgpu_ps variant with an inreg bf16 magnitude (s0) and an inreg double sign |
| ; that is truncated to bfloat in the IR. Every expected sequence reads only s2 |
| ; of the sign operand and contains no conversion instruction for it. |
| ; NOTE(review): s2 is presumably the high dword of the double - confirm. |
| ; The result is returned as a zero-extended i32 through |
| ; llvm.amdgcn.readfirstlane. |
| define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign.f64) { |
| ; GCN-LABEL: s_copysign_bf16_f64: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 |
| ; GCN-NEXT: s_and_b32 s0, s2, 0x80000000 |
| ; GCN-NEXT: s_lshr_b32 s0, s0, 16 |
| ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GCN-NEXT: v_or_b32_e32 v0, s0, v0 |
| ; GCN-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX7-LABEL: s_copysign_bf16_f64: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0 |
| ; GFX7-NEXT: s_and_b32 s0, s2, 0x80000000 |
| ; GFX7-NEXT: s_lshr_b32 s0, s0, 16 |
| ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX7-NEXT: ; return to shader part epilog |
| ; |
| ; GFX8-LABEL: s_copysign_bf16_f64: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: v_lshrrev_b32_e64 v0, 16, s2 |
| ; GFX8-NEXT: s_movk_i32 s1, 0x7fff |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX8-NEXT: v_bfi_b32 v0, s1, v1, v0 |
| ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 |
| ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX8-NEXT: ; return to shader part epilog |
| ; |
| ; GFX9-LABEL: s_copysign_bf16_f64: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 16, s2 |
| ; GFX9-NEXT: s_movk_i32 s1, 0x7fff |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX9-NEXT: v_bfi_b32 v0, s1, v1, v0 |
| ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 |
| ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: s_copysign_bf16_f64: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 16, s2 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 |
| ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 |
| ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: s_copysign_bf16_f64: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_lshrrev_b32_e64 v0, 16, s2 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; Narrow the double sign source to bfloat so it matches the intrinsic's type. |
| %sign = fptrunc double %sign.f64 to bfloat |
| %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) |
| ; Widen the 16-bit result to i32 and return it via readfirstlane. |
| %cast = bitcast bfloat %op to i16 |
| %zext = zext i16 %cast to i32 |
| %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext) |
| ret i32 %readlane |
| } |
| |
| ; amdgpu_ps variant with an inreg bf16 magnitude (s0) and an inreg half sign |
| ; (s1) that is bitcast, not converted, to bfloat. The GCN and GFX7 checks run |
| ; v_cvt_f16_f32 on s1 and mask the result with 0x8000; the GFX8 and later |
| ; checks use v_bfi_b32 with the 0x7fff mask followed by a v_and_b32 with |
| ; 0xffff. The result is returned as a zero-extended i32 through |
| ; llvm.amdgcn.readfirstlane. |
| define amdgpu_ps i32 @s_copysign_bf16_f16(bfloat inreg %mag, half inreg %sign.f16) { |
| ; GCN-LABEL: s_copysign_bf16_f16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 |
| ; GCN-NEXT: v_cvt_f16_f32_e32 v1, s1 |
| ; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1 |
| ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 |
| ; GCN-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX7-LABEL: s_copysign_bf16_f16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, s1 |
| ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0 |
| ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX7-NEXT: ; return to shader part epilog |
| ; |
| ; GFX8-LABEL: s_copysign_bf16_f16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_movk_i32 s2, 0x7fff |
| ; GFX8-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX8-NEXT: v_bfi_b32 v0, s2, v0, v1 |
| ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 |
| ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX8-NEXT: ; return to shader part epilog |
| ; |
| ; GFX9-LABEL: s_copysign_bf16_f16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_movk_i32 s2, 0x7fff |
| ; GFX9-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1 |
| ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 |
| ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: s_copysign_bf16_f16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s1 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 |
| ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 |
| ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: s_copysign_bf16_f16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; Reinterpret the 16 bits of the half as a bfloat; no value conversion happens. |
| %sign = bitcast half %sign.f16 to bfloat |
| %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) |
| ; Widen the 16-bit result to i32 and return it via readfirstlane. |
| %cast = bitcast bfloat %op to i16 |
| %zext = zext i16 %cast to i32 |
| %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext) |
| ret i32 %readlane |
| } |
| |
| declare float @llvm.copysign.f32(float, float) |
| |
| ; Reverse direction: float magnitude, with a bfloat sign that is extended to |
| ; float before llvm.copysign.f32. The GCN and GFX7 checks pass v1 to |
| ; v_bfi_b32 unchanged, with the mask built by s_brev_b32 s4, -2. The GFX8 and |
| ; later checks first shift v1 left by 16; GFX10 and GFX11 spell the mask as |
| ; the literal 0x7fffffff. |
| define float @v_copysign_f32_bf16(float %mag, bfloat %sign.bf16) { |
| ; GCN-LABEL: v_copysign_f32_bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_brev_b32 s4, -2 |
| ; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_f32_bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: s_brev_b32 s4, -2 |
| ; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_f32_bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GFX8-NEXT: s_brev_b32 s4, -2 |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_f32_bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GFX9-NEXT: s_brev_b32 s4, -2 |
| ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_f32_bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-LABEL: v_copysign_f32_bf16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1 |
| ; GFX11-NEXT: s_setpc_b64 s[30:31] |
| ; Widen the bfloat sign source to float so it matches the intrinsic's type. |
| %sign = fpext bfloat %sign.bf16 to float |
| %op = call float @llvm.copysign.f32(float %mag, float %sign) |
| ret float %op |
| } |
| |
| ; amdgpu_ps variant of the float-magnitude test: the magnitude (s0) and the |
| ; bfloat sign (s1) are both inreg. The float result is bitcast to i32 and |
| ; returned through llvm.amdgcn.readfirstlane. The GFX8 and later checks shift |
| ; s1 left by 16 with v_lshlrev_b32_e64 before the v_bfi_b32; the GCN and GFX7 |
| ; checks copy s1 into v1 without a shift. |
| define amdgpu_ps i32 @s_copysign_f32_bf16(float inreg %mag, bfloat inreg %sign.bf16) { |
| ; GCN-LABEL: s_copysign_f32_bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_brev_b32 s2, -2 |
| ; GCN-NEXT: v_mov_b32_e32 v0, s0 |
| ; GCN-NEXT: v_mov_b32_e32 v1, s1 |
| ; GCN-NEXT: v_bfi_b32 v0, s2, v0, v1 |
| ; GCN-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX7-LABEL: s_copysign_f32_bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_brev_b32 s2, -2 |
| ; GFX7-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX7-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX7-NEXT: v_bfi_b32 v0, s2, v0, v1 |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX7-NEXT: ; return to shader part epilog |
| ; |
| ; GFX8-LABEL: s_copysign_f32_bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: v_lshlrev_b32_e64 v0, 16, s1 |
| ; GFX8-NEXT: s_brev_b32 s1, -2 |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX8-NEXT: v_bfi_b32 v0, s1, v1, v0 |
| ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX8-NEXT: ; return to shader part epilog |
| ; |
| ; GFX9-LABEL: s_copysign_f32_bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_lshlrev_b32_e64 v0, 16, s1 |
| ; GFX9-NEXT: s_brev_b32 s1, -2 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX9-NEXT: v_bfi_b32 v0, s1, v1, v0 |
| ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: s_copysign_f32_bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_lshlrev_b32_e64 v0, 16, s1 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 |
| ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: s_copysign_f32_bf16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 |
| ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; Widen the bfloat sign source to float so it matches the intrinsic's type. |
| %sign = fpext bfloat %sign.bf16 to float |
| %op = call float @llvm.copysign.f32(float %mag, float %sign) |
| ; Return the float's bit pattern as an i32 via readfirstlane. |
| %cast = bitcast float %op to i32 |
| %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast) |
| ret i32 %readlane |
| } |
| |
| declare half @llvm.copysign.f16(half, half) |
| |
| ; Half magnitude with a bfloat sign that is bitcast, not converted, to half |
| ; before llvm.copysign.f16. The GCN and GFX7 checks go through |
| ; v_cvt_f16_f32 / v_cvt_f32_f16 conversions and use v_bfi_b32 with the |
| ; s_brev_b32 s4, -2 mask; the GFX8 and later checks expect a single |
| ; v_bfi_b32 with the 0x7fff mask. |
| define half @v_copysign_f16_bf16(half %mag, bfloat %sign.bf16) { |
| ; GCN-LABEL: v_copysign_f16_bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 |
| ; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 |
| ; GCN-NEXT: s_brev_b32 s4, -2 |
| ; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_f16_bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 |
| ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 |
| ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 |
| ; GFX7-NEXT: s_brev_b32 s4, -2 |
| ; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_f16_bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: s_movk_i32 s4, 0x7fff |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_f16_bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_movk_i32 s4, 0x7fff |
| ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_f16_bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-LABEL: v_copysign_f16_bf16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 |
| ; GFX11-NEXT: s_setpc_b64 s[30:31] |
| ; Reinterpret the 16 bits of the bfloat as a half; no value conversion happens. |
| %sign = bitcast bfloat %sign.bf16 to half |
| %op = call half @llvm.copysign.f16(half %mag, half %sign) |
| ret half %op |
| } |
| |
| ; amdgpu_ps variant of the half-magnitude test: the magnitude (s0) and the |
| ; bfloat sign (s1) are both inreg, and the sign is bitcast to half. The half |
| ; result is bitcast to i16, zero-extended to i32 and returned through |
| ; llvm.amdgcn.readfirstlane. The GCN and GFX7 checks convert the v_bfi_b32 |
| ; result back with v_cvt_f16_f32; the GFX8 and later checks mask it with |
| ; 0xffff instead. |
| define amdgpu_ps i32 @s_copysign_f16_bf16(half inreg %mag, bfloat inreg %sign.bf16) { |
| ; GCN-LABEL: s_copysign_f16_bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s1 |
| ; GCN-NEXT: v_cvt_f16_f32_e32 v1, s0 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 |
| ; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 |
| ; GCN-NEXT: s_brev_b32 s0, -2 |
| ; GCN-NEXT: v_bfi_b32 v0, s0, v1, v0 |
| ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 |
| ; GCN-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX7-LABEL: s_copysign_f16_bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, s0 |
| ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 |
| ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 |
| ; GFX7-NEXT: s_brev_b32 s0, -2 |
| ; GFX7-NEXT: v_bfi_b32 v0, s0, v0, v1 |
| ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX7-NEXT: ; return to shader part epilog |
| ; |
| ; GFX8-LABEL: s_copysign_f16_bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_movk_i32 s2, 0x7fff |
| ; GFX8-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX8-NEXT: v_bfi_b32 v0, s2, v0, v1 |
| ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 |
| ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX8-NEXT: ; return to shader part epilog |
| ; |
| ; GFX9-LABEL: s_copysign_f16_bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_movk_i32 s2, 0x7fff |
| ; GFX9-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1 |
| ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 |
| ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: s_copysign_f16_bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s1 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 |
| ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 |
| ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: s_copysign_f16_bf16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 |
| ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; Reinterpret the 16 bits of the bfloat as a half; no value conversion happens. |
| %sign = bitcast bfloat %sign.bf16 to half |
| %op = call half @llvm.copysign.f16(half %mag, half %sign) |
| ; Widen the 16-bit result to i32 and return it via readfirstlane. |
| %cast = bitcast half %op to i16 |
| %zext = zext i16 %cast to i32 |
| %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext) |
| ret i32 %readlane |
| } |
| |
| ; Declaration of the f64 copysign intrinsic called by the f64/bf16 tests below. |
| declare double @llvm.copysign.f64(double, double) |
| |
| ; copysign of a double magnitude with a bfloat sign operand, using the default |
| ; calling convention. In every expected sequence below the result is produced |
| ; by a single v_bfi_b32 that writes v1; v0 is not written. The mask is |
| ; 0x7fffffff, either as a literal (gfx10/gfx11 runs) or materialized with |
| ; s_brev_b32 of -2. The tonga-and-newer runs first shift the sign operand in v2 |
| ; left by 16; the default-cpu and hawaii runs use v2 unshifted. |
| ; The check lines are autogenerated (see the NOTE at the top of the file); |
| ; regenerate them with the update script instead of editing them by hand. |
| define double @v_copysign_f64_bf16(double %mag, bfloat %sign.bf16) { |
| ; GCN-LABEL: v_copysign_f64_bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_brev_b32 s4, -2 |
| ; GCN-NEXT: v_bfi_b32 v1, s4, v1, v2 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_f64_bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: s_brev_b32 s4, -2 |
| ; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v2 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_f64_bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GFX8-NEXT: s_brev_b32 s4, -2 |
| ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_f64_bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GFX9-NEXT: s_brev_b32 s4, -2 |
| ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_f64_bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GFX10-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-LABEL: v_copysign_f64_bf16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2 |
| ; GFX11-NEXT: s_setpc_b64 s[30:31] |
| ; Widen the bfloat sign operand to the double type llvm.copysign.f64 takes. |
| %sign = fpext bfloat %sign.bf16 to double |
| %op = call double @llvm.copysign.f64(double %mag, double %sign) |
| ret double %op |
| } |
| |
| ; amdgpu_ps variant of the f64/bf16 copysign test: both operands are inreg and |
| ; the double result is returned as <2 x i32>, with each 32-bit half passed |
| ; through llvm.amdgcn.readfirstlane. In the expected sequences the v_bfi_b32 |
| ; (mask 0x7fffffff) combines s1 with the sign operand from s2, and only that |
| ; dword is read back, into s1; no expected instruction writes s0. |
| ; The tonga-and-newer runs shift s2 left by 16 before the v_bfi_b32; the |
| ; default-cpu and hawaii runs copy s2 unshifted. |
| ; The check lines are autogenerated; regenerate rather than hand-edit them. |
| define amdgpu_ps <2 x i32> @s_copysign_f64_bf16(double inreg %mag, bfloat inreg %sign.bf16) { |
| ; GCN-LABEL: s_copysign_f64_bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_brev_b32 s3, -2 |
| ; GCN-NEXT: v_mov_b32_e32 v0, s1 |
| ; GCN-NEXT: v_mov_b32_e32 v1, s2 |
| ; GCN-NEXT: v_bfi_b32 v0, s3, v0, v1 |
| ; GCN-NEXT: v_readfirstlane_b32 s1, v0 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX7-LABEL: s_copysign_f64_bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_brev_b32 s3, -2 |
| ; GFX7-NEXT: v_mov_b32_e32 v0, s1 |
| ; GFX7-NEXT: v_mov_b32_e32 v1, s2 |
| ; GFX7-NEXT: v_bfi_b32 v0, s3, v0, v1 |
| ; GFX7-NEXT: v_readfirstlane_b32 s1, v0 |
| ; GFX7-NEXT: ; return to shader part epilog |
| ; |
| ; GFX8-LABEL: s_copysign_f64_bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: v_lshlrev_b32_e64 v0, 16, s2 |
| ; GFX8-NEXT: s_brev_b32 s2, -2 |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX8-NEXT: v_bfi_b32 v0, s2, v1, v0 |
| ; GFX8-NEXT: v_readfirstlane_b32 s1, v0 |
| ; GFX8-NEXT: ; return to shader part epilog |
| ; |
| ; GFX9-LABEL: s_copysign_f64_bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_lshlrev_b32_e64 v0, 16, s2 |
| ; GFX9-NEXT: s_brev_b32 s2, -2 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX9-NEXT: v_bfi_b32 v0, s2, v1, v0 |
| ; GFX9-NEXT: v_readfirstlane_b32 s1, v0 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: s_copysign_f64_bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_lshlrev_b32_e64 v0, 16, s2 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0 |
| ; GFX10-NEXT: v_readfirstlane_b32 s1, v0 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: s_copysign_f64_bf16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s2 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0 |
| ; GFX11-NEXT: v_readfirstlane_b32 s1, v0 |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; Widen the bfloat sign operand to the double type llvm.copysign.f64 takes. |
| %sign = fpext bfloat %sign.bf16 to double |
| %op = call double @llvm.copysign.f64(double %mag, double %sign) |
| ; Split the double into two i32 halves and readfirstlane each one before |
| ; rebuilding the <2 x i32> return value. |
| %cast = bitcast double %op to <2 x i32> |
| %cast.0 = extractelement <2 x i32> %cast, i32 0 |
| %cast.1 = extractelement <2 x i32> %cast, i32 1 |
| %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.0) |
| %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.1) |
| %ins.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0 |
| %ins.1 = insertelement <2 x i32> %ins.0, i32 %readlane1, i32 1 |
| ret <2 x i32> %ins.1 |
| } |
| |
| ; <2 x bfloat> copysign with inreg operands in an amdgpu_ps function; the |
| ; vector result is bitcast to a single i32 for the return. |
| ; The default-cpu and hawaii runs expect each element to be processed on its |
| ; own (v_bfe_u32 for the 15 magnitude bits, v_and_b32 with 0x8000 for the sign |
| ; bit) and then repacked with a shift and v_or_b32. The tonga-and-newer runs |
| ; expect one v_bfi_b32 with mask 0x7fff7fff covering both elements. |
| ; The check lines are autogenerated; regenerate rather than hand-edit them. |
| define amdgpu_ps i32 @s_copysign_v2bf16(<2 x bfloat> inreg %arg_mag, <2 x bfloat> inreg %arg_sign) { |
| ; GCN-LABEL: s_copysign_v2bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s3 |
| ; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s2 |
| ; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s1 |
| ; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s0 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: v_bfe_u32 v3, v3, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1 |
| ; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0 |
| ; GCN-NEXT: v_or_b32_e32 v1, v3, v1 |
| ; GCN-NEXT: v_or_b32_e32 v0, v2, v0 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_or_b32_e32 v0, v1, v0 |
| ; GCN-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX7-LABEL: s_copysign_v2bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s3 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s2 |
| ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s1 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s0 |
| ; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0 |
| ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1 |
| ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 |
| ; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX7-NEXT: ; return to shader part epilog |
| ; |
| ; GFX8-LABEL: s_copysign_v2bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_mov_b32 s2, 0x7fff7fff |
| ; GFX8-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX8-NEXT: v_bfi_b32 v0, s2, v0, v1 |
| ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX8-NEXT: ; return to shader part epilog |
| ; |
| ; GFX9-LABEL: s_copysign_v2bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_mov_b32 s2, 0x7fff7fff |
| ; GFX9-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1 |
| ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: s_copysign_v2bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s1 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0 |
| ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: s_copysign_v2bf16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0 |
| ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX11-NEXT: ; return to shader part epilog |
| %out = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> %arg_mag, <2 x bfloat> %arg_sign) |
| %cast = bitcast <2 x bfloat> %out to i32 |
| ret i32 %cast |
| } |
| |
| ; <3 x bfloat> copysign with inreg operands in an amdgpu_ps function; the |
| ; result is bitcast to <3 x i16> for the return. |
| ; The default-cpu and hawaii runs expect per-element v_bfe_u32 / v_and_b32 / |
| ; v_or_b32 code, a v_alignbit_b32 repack, and three v_readfirstlane_b32 |
| ; results (s0, s1, s2). The tonga-and-newer runs expect two v_bfi_b32 with |
| ; mask 0x7fff7fff and results in s0 and s1 only. |
| ; The check lines are autogenerated; regenerate rather than hand-edit them. |
| define amdgpu_ps <3 x i16> @s_copysign_v3bf16(<3 x bfloat> inreg %arg_mag, <3 x bfloat> inreg %arg_sign) { |
| ; GCN-LABEL: s_copysign_v3bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s5 |
| ; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s4 |
| ; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s3 |
| ; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s2 |
| ; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s1 |
| ; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s0 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 |
| ; GCN-NEXT: v_bfe_u32 v5, v5, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v4, v4, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v3, v3, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2 |
| ; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1 |
| ; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0 |
| ; GCN-NEXT: v_or_b32_e32 v2, v5, v2 |
| ; GCN-NEXT: v_or_b32_e32 v1, v4, v1 |
| ; GCN-NEXT: v_or_b32_e32 v0, v3, v0 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: v_or_b32_e32 v2, v2, v1 |
| ; GCN-NEXT: v_alignbit_b32 v1, v0, v1, 16 |
| ; GCN-NEXT: v_readfirstlane_b32 s1, v1 |
| ; GCN-NEXT: v_readfirstlane_b32 s0, v2 |
| ; GCN-NEXT: v_readfirstlane_b32 s2, v0 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX7-LABEL: s_copysign_v3bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s4 |
| ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s5 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s3 |
| ; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s1 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 |
| ; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s2 |
| ; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s0 |
| ; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1 |
| ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v2 |
| ; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 |
| ; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0 |
| ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 |
| ; GFX7-NEXT: v_or_b32_e32 v2, v2, v1 |
| ; GFX7-NEXT: v_alignbit_b32 v1, v0, v1, 16 |
| ; GFX7-NEXT: v_readfirstlane_b32 s1, v1 |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v2 |
| ; GFX7-NEXT: v_readfirstlane_b32 s2, v0 |
| ; GFX7-NEXT: ; return to shader part epilog |
| ; |
| ; GFX8-LABEL: s_copysign_v3bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX8-NEXT: v_mov_b32_e32 v0, s1 |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1 |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX8-NEXT: v_mov_b32_e32 v2, s2 |
| ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2 |
| ; GFX8-NEXT: v_readfirstlane_b32 s0, v1 |
| ; GFX8-NEXT: v_readfirstlane_b32 s1, v0 |
| ; GFX8-NEXT: ; return to shader part epilog |
| ; |
| ; GFX9-LABEL: s_copysign_v3bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX9-NEXT: v_mov_b32_e32 v0, s1 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX9-NEXT: v_mov_b32_e32 v2, s2 |
| ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2 |
| ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 |
| ; GFX9-NEXT: v_readfirstlane_b32 s1, v0 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: s_copysign_v3bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0 |
| ; GFX10-NEXT: v_bfi_b32 v1, 0x7fff7fff, s1, v1 |
| ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: s_copysign_v3bf16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0 |
| ; GFX11-NEXT: v_bfi_b32 v1, 0x7fff7fff, s1, v1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX11-NEXT: v_readfirstlane_b32 s1, v1 |
| ; GFX11-NEXT: ; return to shader part epilog |
| %out = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> %arg_mag, <3 x bfloat> %arg_sign) |
| %cast = bitcast <3 x bfloat> %out to <3 x i16> |
| ret <3 x i16> %cast |
| } |
| |
| ; <4 x bfloat> copysign with inreg operands in an amdgpu_ps function; the |
| ; result is bitcast to <2 x i32> for the return. |
| ; The default-cpu and hawaii runs expect per-element v_bfe_u32 / v_and_b32 / |
| ; v_or_b32 code followed by shift-and-or repacking into two dwords. The |
| ; tonga-and-newer runs expect two v_bfi_b32 with mask 0x7fff7fff, one per |
| ; result dword, with results read back into s0 and s1. |
| ; The check lines are autogenerated; regenerate rather than hand-edit them. |
| define amdgpu_ps <2 x i32> @s_copysign_v4bf16(<4 x bfloat> inreg %arg_mag, <4 x bfloat> inreg %arg_sign) { |
| ; GCN-LABEL: s_copysign_v4bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s5 |
| ; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s4 |
| ; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s7 |
| ; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s6 |
| ; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s1 |
| ; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s0 |
| ; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s3 |
| ; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s2 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GCN-NEXT: v_bfe_u32 v7, v7, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v6, v6, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v5, v5, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v4, v4, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3 |
| ; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2 |
| ; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1 |
| ; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0 |
| ; GCN-NEXT: v_or_b32_e32 v3, v7, v3 |
| ; GCN-NEXT: v_or_b32_e32 v2, v6, v2 |
| ; GCN-NEXT: v_or_b32_e32 v1, v5, v1 |
| ; GCN-NEXT: v_or_b32_e32 v0, v4, v0 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_or_b32_e32 v2, v3, v2 |
| ; GCN-NEXT: v_or_b32_e32 v0, v1, v0 |
| ; GCN-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GCN-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX7-LABEL: s_copysign_v4bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s7 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 |
| ; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s6 |
| ; GFX7-NEXT: v_mul_f32_e64 v6, 1.0, s3 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GFX7-NEXT: v_mul_f32_e64 v7, 1.0, s2 |
| ; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v2 |
| ; GFX7-NEXT: v_bfe_u32 v6, v6, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s4 |
| ; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v3 |
| ; GFX7-NEXT: v_bfe_u32 v7, v7, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v2, v6, v2 |
| ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s5 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s0 |
| ; GFX7-NEXT: v_or_b32_e32 v3, v7, v3 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s1 |
| ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 |
| ; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1 |
| ; GFX7-NEXT: v_bfe_u32 v3, v5, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 |
| ; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0 |
| ; GFX7-NEXT: v_bfe_u32 v3, v4, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX7-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX7-NEXT: ; return to shader part epilog |
| ; |
| ; GFX8-LABEL: s_copysign_v4bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX8-NEXT: v_mov_b32_e32 v0, s1 |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1 |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX8-NEXT: v_mov_b32_e32 v2, s2 |
| ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2 |
| ; GFX8-NEXT: v_readfirstlane_b32 s0, v1 |
| ; GFX8-NEXT: v_readfirstlane_b32 s1, v0 |
| ; GFX8-NEXT: ; return to shader part epilog |
| ; |
| ; GFX9-LABEL: s_copysign_v4bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX9-NEXT: v_mov_b32_e32 v0, s1 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX9-NEXT: v_mov_b32_e32 v2, s2 |
| ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2 |
| ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 |
| ; GFX9-NEXT: v_readfirstlane_b32 s1, v0 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: s_copysign_v4bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0 |
| ; GFX10-NEXT: v_bfi_b32 v1, 0x7fff7fff, s1, v1 |
| ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: s_copysign_v4bf16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0 |
| ; GFX11-NEXT: v_bfi_b32 v1, 0x7fff7fff, s1, v1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX11-NEXT: v_readfirstlane_b32 s1, v1 |
| ; GFX11-NEXT: ; return to shader part epilog |
| %out = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> %arg_mag, <4 x bfloat> %arg_sign) |
| %cast = bitcast <4 x bfloat> %out to <2 x i32> |
| ret <2 x i32> %cast |
| } |
| |
| ; <8 x bfloat> copysign with inreg operands in an amdgpu_ps function; the |
| ; result is bitcast to <4 x i32> for the return. |
| ; The default-cpu and hawaii runs expect per-element v_bfe_u32 / v_and_b32 / |
| ; v_or_b32 code followed by shift-and-or repacking into four dwords. The |
| ; tonga-and-newer runs expect four v_bfi_b32 with mask 0x7fff7fff, one per |
| ; result dword, with results read back into s0 through s3. |
| ; The check lines are autogenerated; regenerate rather than hand-edit them. |
| define amdgpu_ps <4 x i32> @s_copysign_v8bf16(<8 x bfloat> inreg %arg_mag, <8 x bfloat> inreg %arg_sign) { |
| ; GCN-LABEL: s_copysign_v8bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s9 |
| ; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s8 |
| ; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s11 |
| ; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s10 |
| ; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s13 |
| ; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s12 |
| ; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s15 |
| ; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s14 |
| ; GCN-NEXT: v_mul_f32_e64 v8, 1.0, s1 |
| ; GCN-NEXT: v_mul_f32_e64 v9, 1.0, s0 |
| ; GCN-NEXT: v_mul_f32_e64 v10, 1.0, s3 |
| ; GCN-NEXT: v_mul_f32_e64 v11, 1.0, s2 |
| ; GCN-NEXT: v_mul_f32_e64 v12, 1.0, s5 |
| ; GCN-NEXT: v_mul_f32_e64 v13, 1.0, s4 |
| ; GCN-NEXT: v_mul_f32_e64 v14, 1.0, s7 |
| ; GCN-NEXT: v_mul_f32_e64 v15, 1.0, s6 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 |
| ; GCN-NEXT: v_bfe_u32 v15, v15, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v14, v14, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v13, v13, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v12, v12, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v11, v11, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v10, v10, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v9, v9, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v8, v8, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v7, 0x8000, v7 |
| ; GCN-NEXT: v_and_b32_e32 v6, 0x8000, v6 |
| ; GCN-NEXT: v_and_b32_e32 v5, 0x8000, v5 |
| ; GCN-NEXT: v_and_b32_e32 v4, 0x8000, v4 |
| ; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3 |
| ; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2 |
| ; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1 |
| ; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0 |
| ; GCN-NEXT: v_or_b32_e32 v7, v15, v7 |
| ; GCN-NEXT: v_or_b32_e32 v6, v14, v6 |
| ; GCN-NEXT: v_or_b32_e32 v5, v13, v5 |
| ; GCN-NEXT: v_or_b32_e32 v4, v12, v4 |
| ; GCN-NEXT: v_or_b32_e32 v3, v11, v3 |
| ; GCN-NEXT: v_or_b32_e32 v2, v10, v2 |
| ; GCN-NEXT: v_or_b32_e32 v1, v9, v1 |
| ; GCN-NEXT: v_or_b32_e32 v0, v8, v0 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_or_b32_e32 v6, v7, v6 |
| ; GCN-NEXT: v_or_b32_e32 v4, v5, v4 |
| ; GCN-NEXT: v_or_b32_e32 v2, v3, v2 |
| ; GCN-NEXT: v_or_b32_e32 v0, v1, v0 |
| ; GCN-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GCN-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GCN-NEXT: v_readfirstlane_b32 s2, v4 |
| ; GCN-NEXT: v_readfirstlane_b32 s3, v6 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX7-LABEL: s_copysign_v8bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: v_mul_f32_e64 v6, 1.0, s15 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 |
| ; GFX7-NEXT: v_mul_f32_e64 v7, 1.0, s14 |
| ; GFX7-NEXT: v_mul_f32_e64 v14, 1.0, s7 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 |
| ; GFX7-NEXT: v_mul_f32_e64 v15, 1.0, s6 |
| ; GFX7-NEXT: v_and_b32_e32 v6, 0x8000, v6 |
| ; GFX7-NEXT: v_bfe_u32 v14, v14, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s12 |
| ; GFX7-NEXT: v_and_b32_e32 v7, 0x8000, v7 |
| ; GFX7-NEXT: v_bfe_u32 v15, v15, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v6, v14, v6 |
| ; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s13 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 |
| ; GFX7-NEXT: v_mul_f32_e64 v13, 1.0, s4 |
| ; GFX7-NEXT: v_or_b32_e32 v7, v15, v7 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 |
| ; GFX7-NEXT: v_mul_f32_e64 v12, 1.0, s5 |
| ; GFX7-NEXT: v_or_b32_e32 v6, v7, v6 |
| ; GFX7-NEXT: v_and_b32_e32 v5, 0x8000, v5 |
| ; GFX7-NEXT: v_bfe_u32 v7, v13, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 |
| ; GFX7-NEXT: v_and_b32_e32 v4, 0x8000, v4 |
| ; GFX7-NEXT: v_bfe_u32 v7, v12, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s10 |
| ; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 |
| ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s11 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GFX7-NEXT: v_mul_f32_e64 v11, 1.0, s2 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 |
| ; GFX7-NEXT: v_mul_f32_e64 v10, 1.0, s3 |
| ; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 |
| ; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v3 |
| ; GFX7-NEXT: v_bfe_u32 v5, v11, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 |
| ; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v2 |
| ; GFX7-NEXT: v_bfe_u32 v5, v10, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s8 |
| ; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 |
| ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s9 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: v_mul_f32_e64 v9, 1.0, s0 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: v_mul_f32_e64 v8, 1.0, s1 |
| ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 |
| ; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1 |
| ; GFX7-NEXT: v_bfe_u32 v3, v9, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 |
| ; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0 |
| ; GFX7-NEXT: v_bfe_u32 v3, v8, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX7-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX7-NEXT: v_readfirstlane_b32 s2, v4 |
| ; GFX7-NEXT: v_readfirstlane_b32 s3, v6 |
| ; GFX7-NEXT: ; return to shader part epilog |
| ; |
| ; GFX8-LABEL: s_copysign_v8bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_mov_b32 s8, 0x7fff7fff |
| ; GFX8-NEXT: v_mov_b32_e32 v0, s3 |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s7 |
| ; GFX8-NEXT: v_bfi_b32 v0, s8, v0, v1 |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s2 |
| ; GFX8-NEXT: v_mov_b32_e32 v2, s6 |
| ; GFX8-NEXT: v_bfi_b32 v1, s8, v1, v2 |
| ; GFX8-NEXT: v_mov_b32_e32 v2, s1 |
| ; GFX8-NEXT: v_mov_b32_e32 v3, s5 |
| ; GFX8-NEXT: v_bfi_b32 v2, s8, v2, v3 |
| ; GFX8-NEXT: v_mov_b32_e32 v3, s0 |
| ; GFX8-NEXT: v_mov_b32_e32 v4, s4 |
| ; GFX8-NEXT: v_bfi_b32 v3, s8, v3, v4 |
| ; GFX8-NEXT: v_readfirstlane_b32 s0, v3 |
| ; GFX8-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 |
| ; GFX8-NEXT: v_readfirstlane_b32 s3, v0 |
| ; GFX8-NEXT: ; return to shader part epilog |
| ; |
| ; GFX9-LABEL: s_copysign_v8bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_mov_b32 s8, 0x7fff7fff |
| ; GFX9-NEXT: v_mov_b32_e32 v0, s3 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s7 |
| ; GFX9-NEXT: v_bfi_b32 v0, s8, v0, v1 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s2 |
| ; GFX9-NEXT: v_mov_b32_e32 v2, s6 |
| ; GFX9-NEXT: v_bfi_b32 v1, s8, v1, v2 |
| ; GFX9-NEXT: v_mov_b32_e32 v2, s1 |
| ; GFX9-NEXT: v_mov_b32_e32 v3, s5 |
| ; GFX9-NEXT: v_bfi_b32 v2, s8, v2, v3 |
| ; GFX9-NEXT: v_mov_b32_e32 v3, s0 |
| ; GFX9-NEXT: v_mov_b32_e32 v4, s4 |
| ; GFX9-NEXT: v_bfi_b32 v3, s8, v3, v4 |
| ; GFX9-NEXT: v_readfirstlane_b32 s0, v3 |
| ; GFX9-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 |
| ; GFX9-NEXT: v_readfirstlane_b32 s3, v0 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: s_copysign_v8bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s4 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s5 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, s6 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, s7 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0 |
| ; GFX10-NEXT: v_bfi_b32 v1, 0x7fff7fff, s1, v1 |
| ; GFX10-NEXT: v_bfi_b32 v2, 0x7fff7fff, s2, v2 |
| ; GFX10-NEXT: v_bfi_b32 v3, 0x7fff7fff, s3, v3 |
| ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 |
| ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 |
| ; GFX10-NEXT: v_readfirstlane_b32 s3, v3 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: s_copysign_v8bf16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 |
| ; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0 |
| ; GFX11-NEXT: v_bfi_b32 v1, 0x7fff7fff, s1, v1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) |
| ; GFX11-NEXT: v_bfi_b32 v2, 0x7fff7fff, s2, v2 |
| ; GFX11-NEXT: v_bfi_b32 v3, 0x7fff7fff, s3, v3 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) |
| ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX11-NEXT: v_readfirstlane_b32 s1, v1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) |
| ; GFX11-NEXT: v_readfirstlane_b32 s2, v2 |
| ; GFX11-NEXT: v_readfirstlane_b32 s3, v3 |
| ; GFX11-NEXT: ; return to shader part epilog |
| %out = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> %arg_mag, <8 x bfloat> %arg_sign) |
| %cast = bitcast <8 x bfloat> %out to <4 x i32> |
| ret <4 x i32> %cast |
| } |
| |
| define amdgpu_ps <8 x i32> @s_copysign_v16bf16(<16 x bfloat> inreg %arg_mag, <16 x bfloat> inreg %arg_sign) { |
| ; GCN-LABEL: s_copysign_v16bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s17 |
| ; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s16 |
| ; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s19 |
| ; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s18 |
| ; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s21 |
| ; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s20 |
| ; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s23 |
| ; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s22 |
| ; GCN-NEXT: v_mul_f32_e64 v8, 1.0, s25 |
| ; GCN-NEXT: v_mul_f32_e64 v9, 1.0, s24 |
| ; GCN-NEXT: v_mul_f32_e64 v10, 1.0, s27 |
| ; GCN-NEXT: v_mul_f32_e64 v11, 1.0, s26 |
| ; GCN-NEXT: v_mul_f32_e64 v12, 1.0, s29 |
| ; GCN-NEXT: v_mul_f32_e64 v13, 1.0, s28 |
| ; GCN-NEXT: v_mul_f32_e64 v14, 1.0, s31 |
| ; GCN-NEXT: v_mul_f32_e64 v15, 1.0, s30 |
| ; GCN-NEXT: v_mul_f32_e64 v16, 1.0, s1 |
| ; GCN-NEXT: v_mul_f32_e64 v17, 1.0, s0 |
| ; GCN-NEXT: v_mul_f32_e64 v18, 1.0, s3 |
| ; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s14 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 |
| ; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v15, 0x8000, v15 |
| ; GCN-NEXT: v_or_b32_e32 v15, v19, v15 |
| ; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s15 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 |
| ; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v14, 0x8000, v14 |
| ; GCN-NEXT: v_or_b32_e32 v14, v19, v14 |
| ; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s12 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 |
| ; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v13, 0x8000, v13 |
| ; GCN-NEXT: v_or_b32_e32 v13, v19, v13 |
| ; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s13 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 |
| ; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v12, 0x8000, v12 |
| ; GCN-NEXT: v_or_b32_e32 v12, v19, v12 |
| ; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s10 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 |
| ; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v11, 0x8000, v11 |
| ; GCN-NEXT: v_or_b32_e32 v11, v19, v11 |
| ; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s11 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 |
| ; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v10, 0x8000, v10 |
| ; GCN-NEXT: v_or_b32_e32 v10, v19, v10 |
| ; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s8 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 |
| ; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v9, 0x8000, v9 |
| ; GCN-NEXT: v_or_b32_e32 v9, v19, v9 |
| ; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s9 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 |
| ; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v8, 0x8000, v8 |
| ; GCN-NEXT: v_or_b32_e32 v8, v19, v8 |
| ; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s6 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 |
| ; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v7, 0x8000, v7 |
| ; GCN-NEXT: v_or_b32_e32 v7, v19, v7 |
| ; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s7 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 |
| ; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v6, 0x8000, v6 |
| ; GCN-NEXT: v_or_b32_e32 v6, v19, v6 |
| ; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s4 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 |
| ; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v5, 0x8000, v5 |
| ; GCN-NEXT: v_or_b32_e32 v5, v19, v5 |
| ; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s5 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 |
| ; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v4, 0x8000, v4 |
| ; GCN-NEXT: v_or_b32_e32 v4, v19, v4 |
| ; GCN-NEXT: v_mul_f32_e64 v19, 1.0, s2 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v18, v18, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v17, v17, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v16, v16, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3 |
| ; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2 |
| ; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1 |
| ; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0 |
| ; GCN-NEXT: v_or_b32_e32 v3, v19, v3 |
| ; GCN-NEXT: v_or_b32_e32 v2, v18, v2 |
| ; GCN-NEXT: v_or_b32_e32 v1, v17, v1 |
| ; GCN-NEXT: v_or_b32_e32 v0, v16, v0 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_or_b32_e32 v14, v15, v14 |
| ; GCN-NEXT: v_or_b32_e32 v12, v13, v12 |
| ; GCN-NEXT: v_or_b32_e32 v10, v11, v10 |
| ; GCN-NEXT: v_or_b32_e32 v8, v9, v8 |
| ; GCN-NEXT: v_or_b32_e32 v6, v7, v6 |
| ; GCN-NEXT: v_or_b32_e32 v4, v5, v4 |
| ; GCN-NEXT: v_or_b32_e32 v2, v3, v2 |
| ; GCN-NEXT: v_or_b32_e32 v0, v1, v0 |
| ; GCN-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GCN-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GCN-NEXT: v_readfirstlane_b32 s2, v4 |
| ; GCN-NEXT: v_readfirstlane_b32 s3, v6 |
| ; GCN-NEXT: v_readfirstlane_b32 s4, v8 |
| ; GCN-NEXT: v_readfirstlane_b32 s5, v10 |
| ; GCN-NEXT: v_readfirstlane_b32 s6, v12 |
| ; GCN-NEXT: v_readfirstlane_b32 s7, v14 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX7-LABEL: s_copysign_v16bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: v_mul_f32_e64 v15, 1.0, s30 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 |
| ; GFX7-NEXT: v_mul_f32_e64 v19, 1.0, s14 |
| ; GFX7-NEXT: v_mul_f32_e64 v14, 1.0, s31 |
| ; GFX7-NEXT: v_and_b32_e32 v15, 0x8000, v15 |
| ; GFX7-NEXT: v_bfe_u32 v19, v19, 16, 15 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 |
| ; GFX7-NEXT: v_or_b32_e32 v15, v19, v15 |
| ; GFX7-NEXT: v_mul_f32_e64 v19, 1.0, s15 |
| ; GFX7-NEXT: v_and_b32_e32 v14, 0x8000, v14 |
| ; GFX7-NEXT: v_bfe_u32 v19, v19, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v14, v19, v14 |
| ; GFX7-NEXT: v_mul_f32_e64 v13, 1.0, s28 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13 |
| ; GFX7-NEXT: v_or_b32_e32 v14, v15, v14 |
| ; GFX7-NEXT: v_mul_f32_e64 v15, 1.0, s12 |
| ; GFX7-NEXT: v_mul_f32_e64 v12, 1.0, s29 |
| ; GFX7-NEXT: v_and_b32_e32 v13, 0x8000, v13 |
| ; GFX7-NEXT: v_bfe_u32 v15, v15, 16, 15 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 |
| ; GFX7-NEXT: v_or_b32_e32 v13, v15, v13 |
| ; GFX7-NEXT: v_mul_f32_e64 v15, 1.0, s13 |
| ; GFX7-NEXT: v_and_b32_e32 v12, 0x8000, v12 |
| ; GFX7-NEXT: v_bfe_u32 v15, v15, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v12, v15, v12 |
| ; GFX7-NEXT: v_mul_f32_e64 v11, 1.0, s26 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11 |
| ; GFX7-NEXT: v_or_b32_e32 v12, v13, v12 |
| ; GFX7-NEXT: v_mul_f32_e64 v13, 1.0, s10 |
| ; GFX7-NEXT: v_mul_f32_e64 v10, 1.0, s27 |
| ; GFX7-NEXT: v_and_b32_e32 v11, 0x8000, v11 |
| ; GFX7-NEXT: v_bfe_u32 v13, v13, 16, 15 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 |
| ; GFX7-NEXT: v_or_b32_e32 v11, v13, v11 |
| ; GFX7-NEXT: v_mul_f32_e64 v13, 1.0, s11 |
| ; GFX7-NEXT: v_and_b32_e32 v10, 0x8000, v10 |
| ; GFX7-NEXT: v_bfe_u32 v13, v13, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v10, v13, v10 |
| ; GFX7-NEXT: v_mul_f32_e64 v9, 1.0, s24 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 |
| ; GFX7-NEXT: v_or_b32_e32 v10, v11, v10 |
| ; GFX7-NEXT: v_mul_f32_e64 v11, 1.0, s8 |
| ; GFX7-NEXT: v_mul_f32_e64 v8, 1.0, s25 |
| ; GFX7-NEXT: v_and_b32_e32 v9, 0x8000, v9 |
| ; GFX7-NEXT: v_bfe_u32 v11, v11, 16, 15 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 |
| ; GFX7-NEXT: v_or_b32_e32 v9, v11, v9 |
| ; GFX7-NEXT: v_mul_f32_e64 v11, 1.0, s9 |
| ; GFX7-NEXT: v_and_b32_e32 v8, 0x8000, v8 |
| ; GFX7-NEXT: v_bfe_u32 v11, v11, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v8, v11, v8 |
| ; GFX7-NEXT: v_mul_f32_e64 v7, 1.0, s22 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 |
| ; GFX7-NEXT: v_mul_f32_e64 v6, 1.0, s23 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 |
| ; GFX7-NEXT: v_or_b32_e32 v8, v9, v8 |
| ; GFX7-NEXT: v_mul_f32_e64 v9, 1.0, s6 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 |
| ; GFX7-NEXT: v_mul_f32_e64 v11, 1.0, s7 |
| ; GFX7-NEXT: v_and_b32_e32 v7, 0x8000, v7 |
| ; GFX7-NEXT: v_bfe_u32 v9, v9, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v7, v9, v7 |
| ; GFX7-NEXT: v_and_b32_e32 v6, 0x8000, v6 |
| ; GFX7-NEXT: v_bfe_u32 v9, v11, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s20 |
| ; GFX7-NEXT: v_or_b32_e32 v6, v9, v6 |
| ; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s21 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 |
| ; GFX7-NEXT: v_mul_f32_e64 v13, 1.0, s4 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 |
| ; GFX7-NEXT: v_mul_f32_e64 v15, 1.0, s5 |
| ; GFX7-NEXT: v_or_b32_e32 v6, v7, v6 |
| ; GFX7-NEXT: v_and_b32_e32 v5, 0x8000, v5 |
| ; GFX7-NEXT: v_bfe_u32 v7, v13, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v5, v7, v5 |
| ; GFX7-NEXT: v_and_b32_e32 v4, 0x8000, v4 |
| ; GFX7-NEXT: v_bfe_u32 v7, v15, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s18 |
| ; GFX7-NEXT: v_or_b32_e32 v4, v7, v4 |
| ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s19 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GFX7-NEXT: v_mul_f32_e64 v19, 1.0, s2 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 |
| ; GFX7-NEXT: v_mul_f32_e64 v18, 1.0, s3 |
| ; GFX7-NEXT: v_or_b32_e32 v4, v5, v4 |
| ; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v3 |
| ; GFX7-NEXT: v_bfe_u32 v5, v19, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v3, v5, v3 |
| ; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v2 |
| ; GFX7-NEXT: v_bfe_u32 v5, v18, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s16 |
| ; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 |
| ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s17 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: v_mul_f32_e64 v17, 1.0, s0 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: v_mul_f32_e64 v16, 1.0, s1 |
| ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 |
| ; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1 |
| ; GFX7-NEXT: v_bfe_u32 v3, v17, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 |
| ; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0 |
| ; GFX7-NEXT: v_bfe_u32 v3, v16, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX7-NEXT: v_readfirstlane_b32 s1, v2 |
| ; GFX7-NEXT: v_readfirstlane_b32 s2, v4 |
| ; GFX7-NEXT: v_readfirstlane_b32 s3, v6 |
| ; GFX7-NEXT: v_readfirstlane_b32 s4, v8 |
| ; GFX7-NEXT: v_readfirstlane_b32 s5, v10 |
| ; GFX7-NEXT: v_readfirstlane_b32 s6, v12 |
| ; GFX7-NEXT: v_readfirstlane_b32 s7, v14 |
| ; GFX7-NEXT: ; return to shader part epilog |
| ; |
| ; GFX8-LABEL: s_copysign_v16bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_mov_b32 s16, 0x7fff7fff |
| ; GFX8-NEXT: v_mov_b32_e32 v0, s7 |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s15 |
| ; GFX8-NEXT: v_bfi_b32 v0, s16, v0, v1 |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s6 |
| ; GFX8-NEXT: v_mov_b32_e32 v2, s14 |
| ; GFX8-NEXT: v_bfi_b32 v1, s16, v1, v2 |
| ; GFX8-NEXT: v_mov_b32_e32 v2, s5 |
| ; GFX8-NEXT: v_mov_b32_e32 v3, s13 |
| ; GFX8-NEXT: v_bfi_b32 v2, s16, v2, v3 |
| ; GFX8-NEXT: v_mov_b32_e32 v3, s4 |
| ; GFX8-NEXT: v_mov_b32_e32 v4, s12 |
| ; GFX8-NEXT: v_bfi_b32 v3, s16, v3, v4 |
| ; GFX8-NEXT: v_mov_b32_e32 v4, s3 |
| ; GFX8-NEXT: v_mov_b32_e32 v5, s11 |
| ; GFX8-NEXT: v_bfi_b32 v4, s16, v4, v5 |
| ; GFX8-NEXT: v_mov_b32_e32 v5, s2 |
| ; GFX8-NEXT: v_mov_b32_e32 v6, s10 |
| ; GFX8-NEXT: v_bfi_b32 v5, s16, v5, v6 |
| ; GFX8-NEXT: v_mov_b32_e32 v6, s1 |
| ; GFX8-NEXT: v_mov_b32_e32 v7, s9 |
| ; GFX8-NEXT: v_bfi_b32 v6, s16, v6, v7 |
| ; GFX8-NEXT: v_mov_b32_e32 v7, s0 |
| ; GFX8-NEXT: v_mov_b32_e32 v8, s8 |
| ; GFX8-NEXT: v_bfi_b32 v7, s16, v7, v8 |
| ; GFX8-NEXT: v_readfirstlane_b32 s0, v7 |
| ; GFX8-NEXT: v_readfirstlane_b32 s1, v6 |
| ; GFX8-NEXT: v_readfirstlane_b32 s2, v5 |
| ; GFX8-NEXT: v_readfirstlane_b32 s3, v4 |
| ; GFX8-NEXT: v_readfirstlane_b32 s4, v3 |
| ; GFX8-NEXT: v_readfirstlane_b32 s5, v2 |
| ; GFX8-NEXT: v_readfirstlane_b32 s6, v1 |
| ; GFX8-NEXT: v_readfirstlane_b32 s7, v0 |
| ; GFX8-NEXT: ; return to shader part epilog |
| ; |
| ; GFX9-LABEL: s_copysign_v16bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_mov_b32 s16, 0x7fff7fff |
| ; GFX9-NEXT: v_mov_b32_e32 v0, s7 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s15 |
| ; GFX9-NEXT: v_bfi_b32 v0, s16, v0, v1 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s6 |
| ; GFX9-NEXT: v_mov_b32_e32 v2, s14 |
| ; GFX9-NEXT: v_bfi_b32 v1, s16, v1, v2 |
| ; GFX9-NEXT: v_mov_b32_e32 v2, s5 |
| ; GFX9-NEXT: v_mov_b32_e32 v3, s13 |
| ; GFX9-NEXT: v_bfi_b32 v2, s16, v2, v3 |
| ; GFX9-NEXT: v_mov_b32_e32 v3, s4 |
| ; GFX9-NEXT: v_mov_b32_e32 v4, s12 |
| ; GFX9-NEXT: v_bfi_b32 v3, s16, v3, v4 |
| ; GFX9-NEXT: v_mov_b32_e32 v4, s3 |
| ; GFX9-NEXT: v_mov_b32_e32 v5, s11 |
| ; GFX9-NEXT: v_bfi_b32 v4, s16, v4, v5 |
| ; GFX9-NEXT: v_mov_b32_e32 v5, s2 |
| ; GFX9-NEXT: v_mov_b32_e32 v6, s10 |
| ; GFX9-NEXT: v_bfi_b32 v5, s16, v5, v6 |
| ; GFX9-NEXT: v_mov_b32_e32 v6, s1 |
| ; GFX9-NEXT: v_mov_b32_e32 v7, s9 |
| ; GFX9-NEXT: v_bfi_b32 v6, s16, v6, v7 |
| ; GFX9-NEXT: v_mov_b32_e32 v7, s0 |
| ; GFX9-NEXT: v_mov_b32_e32 v8, s8 |
| ; GFX9-NEXT: v_bfi_b32 v7, s16, v7, v8 |
| ; GFX9-NEXT: v_readfirstlane_b32 s0, v7 |
| ; GFX9-NEXT: v_readfirstlane_b32 s1, v6 |
| ; GFX9-NEXT: v_readfirstlane_b32 s2, v5 |
| ; GFX9-NEXT: v_readfirstlane_b32 s3, v4 |
| ; GFX9-NEXT: v_readfirstlane_b32 s4, v3 |
| ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 |
| ; GFX9-NEXT: v_readfirstlane_b32 s6, v1 |
| ; GFX9-NEXT: v_readfirstlane_b32 s7, v0 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: s_copysign_v16bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s15 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s14 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, s13 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, s8 |
| ; GFX10-NEXT: v_mov_b32_e32 v4, s9 |
| ; GFX10-NEXT: v_mov_b32_e32 v5, s10 |
| ; GFX10-NEXT: v_mov_b32_e32 v6, s11 |
| ; GFX10-NEXT: v_mov_b32_e32 v7, s12 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, s7, v0 |
| ; GFX10-NEXT: v_bfi_b32 v1, 0x7fff7fff, s6, v1 |
| ; GFX10-NEXT: v_bfi_b32 v2, 0x7fff7fff, s5, v2 |
| ; GFX10-NEXT: v_bfi_b32 v3, 0x7fff7fff, s0, v3 |
| ; GFX10-NEXT: v_bfi_b32 v4, 0x7fff7fff, s1, v4 |
| ; GFX10-NEXT: v_bfi_b32 v5, 0x7fff7fff, s2, v5 |
| ; GFX10-NEXT: v_bfi_b32 v6, 0x7fff7fff, s3, v6 |
| ; GFX10-NEXT: v_bfi_b32 v7, 0x7fff7fff, s4, v7 |
| ; GFX10-NEXT: v_readfirstlane_b32 s0, v3 |
| ; GFX10-NEXT: v_readfirstlane_b32 s1, v4 |
| ; GFX10-NEXT: v_readfirstlane_b32 s2, v5 |
| ; GFX10-NEXT: v_readfirstlane_b32 s3, v6 |
| ; GFX10-NEXT: v_readfirstlane_b32 s4, v7 |
| ; GFX10-NEXT: v_readfirstlane_b32 s5, v2 |
| ; GFX10-NEXT: v_readfirstlane_b32 s6, v1 |
| ; GFX10-NEXT: v_readfirstlane_b32 s7, v0 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: s_copysign_v16bf16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_dual_mov_b32 v0, s15 :: v_dual_mov_b32 v1, s14 |
| ; GFX11-NEXT: v_dual_mov_b32 v2, s13 :: v_dual_mov_b32 v3, s8 |
| ; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10 |
| ; GFX11-NEXT: v_dual_mov_b32 v6, s11 :: v_dual_mov_b32 v7, s12 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, s7, v0 |
| ; GFX11-NEXT: v_bfi_b32 v1, 0x7fff7fff, s6, v1 |
| ; GFX11-NEXT: v_bfi_b32 v2, 0x7fff7fff, s5, v2 |
| ; GFX11-NEXT: v_bfi_b32 v3, 0x7fff7fff, s0, v3 |
| ; GFX11-NEXT: v_bfi_b32 v4, 0x7fff7fff, s1, v4 |
| ; GFX11-NEXT: v_bfi_b32 v5, 0x7fff7fff, s2, v5 |
| ; GFX11-NEXT: v_bfi_b32 v6, 0x7fff7fff, s3, v6 |
| ; GFX11-NEXT: v_bfi_b32 v7, 0x7fff7fff, s4, v7 |
| ; GFX11-NEXT: v_readfirstlane_b32 s0, v3 |
| ; GFX11-NEXT: v_readfirstlane_b32 s1, v4 |
| ; GFX11-NEXT: v_readfirstlane_b32 s2, v5 |
| ; GFX11-NEXT: v_readfirstlane_b32 s3, v6 |
| ; GFX11-NEXT: v_readfirstlane_b32 s4, v7 |
| ; GFX11-NEXT: v_readfirstlane_b32 s5, v2 |
| ; GFX11-NEXT: v_readfirstlane_b32 s6, v1 |
| ; GFX11-NEXT: v_readfirstlane_b32 s7, v0 |
| ; GFX11-NEXT: ; return to shader part epilog |
| %out = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> %arg_mag, <16 x bfloat> %arg_sign) |
| %cast = bitcast <16 x bfloat> %out to <8 x i32> |
| ret <8 x i32> %cast |
| } |
| |
| ; Codegen test for llvm.copysign on <2 x bfloat>; the expected code below reads its operands from vector registers. |
| ; The check lines are autogenerated (see the note on the first line of this file): regenerate them with utils/update_llc_test_checks.py instead of editing them by hand. |
| define <2 x bfloat> @v_copysign_v2bf16(<2 x bfloat> %mag, <2 x bfloat> %sign) { |
| ; GCN-LABEL: v_copysign_v2bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 |
| ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3 |
| ; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2 |
| ; GCN-NEXT: v_or_b32_e32 v1, v1, v3 |
| ; GCN-NEXT: v_or_b32_e32 v0, v0, v2 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_v2bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v3 |
| ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v2 |
| ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_v2bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_v2bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_v2bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v1 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-LABEL: v_copysign_v2bf16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v1 |
| ; GFX11-NEXT: s_setpc_b64 s[30:31] |
| ; Summary of the expectations above: the GCN and GFX7 prefixes expect a per-element mul/shift/and/or sequence (sign bit selected with 0x8000), |
| ; the GFX8 and GFX9 prefixes expect one v_bfi_b32 with the mask 0x7fff7fff held in s4, and the GFX10 and GFX11 prefixes expect one v_bfi_b32 using that mask as a literal. |
| %result = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> %mag, <2 x bfloat> %sign) |
| ret <2 x bfloat> %result |
| } |
| |
| ; Codegen test for llvm.copysign on <3 x bfloat>; the expected code below reads its operands from vector registers. |
| ; The check lines are autogenerated (see the note on the first line of this file): regenerate them with utils/update_llc_test_checks.py instead of editing them by hand. |
| define <3 x bfloat> @v_copysign_v3bf16(<3 x bfloat> %mag, <3 x bfloat> %sign) { |
| ; GCN-LABEL: v_copysign_v3bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 |
| ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 |
| ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 |
| ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 |
| ; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v5, 0x8000, v5 |
| ; GCN-NEXT: v_and_b32_e32 v4, 0x8000, v4 |
| ; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3 |
| ; GCN-NEXT: v_or_b32_e32 v2, v2, v5 |
| ; GCN-NEXT: v_or_b32_e32 v1, v1, v4 |
| ; GCN-NEXT: v_or_b32_e32 v0, v0, v3 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_v3bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 |
| ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 |
| ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 |
| ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GFX7-NEXT: v_and_b32_e32 v5, 0x8000, v5 |
| ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v4, 0x8000, v4 |
| ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v3 |
| ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 |
| ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_v3bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v2 |
| ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v3 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_v3bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v2 |
| ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v3 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_v3bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2 |
| ; GFX10-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v3 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-LABEL: v_copysign_v3bf16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2 |
| ; GFX11-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v3 |
| ; GFX11-NEXT: s_setpc_b64 s[30:31] |
| ; Summary of the expectations above: the GCN and GFX7 prefixes expect three independent mul/shift/and/or chains (v0-v2 combined with v3-v5), |
| ; while the GFX8 through GFX11 prefixes expect two v_bfi_b32 with mask 0x7fff7fff (v0 with v2, v1 with v3), the same instruction shape as the <4 x bfloat> test. |
| %result = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> %mag, <3 x bfloat> %sign) |
| ret <3 x bfloat> %result |
| } |
| |
| ; Codegen test for llvm.copysign on <4 x bfloat>; the expected code below reads its operands from vector registers. |
| ; The check lines are autogenerated (see the note on the first line of this file): regenerate them with utils/update_llc_test_checks.py instead of editing them by hand. |
| define <4 x bfloat> @v_copysign_v4bf16(<4 x bfloat> %mag, <4 x bfloat> %sign) { |
| ; GCN-LABEL: v_copysign_v4bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 |
| ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 |
| ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 |
| ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 |
| ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 |
| ; GCN-NEXT: v_bfe_u32 v3, v3, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v7, 0x8000, v7 |
| ; GCN-NEXT: v_and_b32_e32 v6, 0x8000, v6 |
| ; GCN-NEXT: v_and_b32_e32 v5, 0x8000, v5 |
| ; GCN-NEXT: v_and_b32_e32 v4, 0x8000, v4 |
| ; GCN-NEXT: v_or_b32_e32 v3, v3, v7 |
| ; GCN-NEXT: v_or_b32_e32 v2, v2, v6 |
| ; GCN-NEXT: v_or_b32_e32 v1, v1, v5 |
| ; GCN-NEXT: v_or_b32_e32 v0, v0, v4 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_v4bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 |
| ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 |
| ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 |
| ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 |
| ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 |
| ; GFX7-NEXT: v_and_b32_e32 v7, 0x8000, v7 |
| ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v6, 0x8000, v6 |
| ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v5, 0x8000, v5 |
| ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v4, 0x8000, v4 |
| ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v3, v3, v7 |
| ; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 |
| ; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_v4bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v2 |
| ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v3 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_v4bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v2 |
| ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v3 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_v4bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2 |
| ; GFX10-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v3 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-LABEL: v_copysign_v4bf16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2 |
| ; GFX11-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v3 |
| ; GFX11-NEXT: s_setpc_b64 s[30:31] |
| ; Summary of the expectations above: the GCN and GFX7 prefixes expect four independent mul/shift/and/or chains (v0-v3 combined with v4-v7), |
| ; while the GFX8 through GFX11 prefixes expect two v_bfi_b32 with mask 0x7fff7fff (v0 with v2, v1 with v3). |
| %result = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> %mag, <4 x bfloat> %sign) |
| ret <4 x bfloat> %result |
| } |
| |
| ; Codegen test for llvm.copysign on <8 x bfloat>; the expected code below reads its operands from vector registers. |
| ; The check lines are autogenerated (see the note on the first line of this file): regenerate them with utils/update_llc_test_checks.py instead of editing them by hand. |
| define <8 x bfloat> @v_copysign_v8bf16(<8 x bfloat> %mag, <8 x bfloat> %sign) { |
| ; GCN-LABEL: v_copysign_v8bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 |
| ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 |
| ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 |
| ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 |
| ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 |
| ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 |
| ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 |
| ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 |
| ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 |
| ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 |
| ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 |
| ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 |
| ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 |
| ; GCN-NEXT: v_bfe_u32 v7, v7, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v6, v6, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v5, v5, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v4, v4, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v3, v3, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v15, 0x8000, v15 |
| ; GCN-NEXT: v_and_b32_e32 v14, 0x8000, v14 |
| ; GCN-NEXT: v_and_b32_e32 v13, 0x8000, v13 |
| ; GCN-NEXT: v_and_b32_e32 v12, 0x8000, v12 |
| ; GCN-NEXT: v_and_b32_e32 v11, 0x8000, v11 |
| ; GCN-NEXT: v_and_b32_e32 v10, 0x8000, v10 |
| ; GCN-NEXT: v_and_b32_e32 v9, 0x8000, v9 |
| ; GCN-NEXT: v_and_b32_e32 v8, 0x8000, v8 |
| ; GCN-NEXT: v_or_b32_e32 v7, v7, v15 |
| ; GCN-NEXT: v_or_b32_e32 v6, v6, v14 |
| ; GCN-NEXT: v_or_b32_e32 v5, v5, v13 |
| ; GCN-NEXT: v_or_b32_e32 v4, v4, v12 |
| ; GCN-NEXT: v_or_b32_e32 v3, v3, v11 |
| ; GCN-NEXT: v_or_b32_e32 v2, v2, v10 |
| ; GCN-NEXT: v_or_b32_e32 v1, v1, v9 |
| ; GCN-NEXT: v_or_b32_e32 v0, v0, v8 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_v8bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 |
| ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 |
| ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 |
| ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 |
| ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 |
| ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 |
| ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 |
| ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 |
| ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 |
| ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 |
| ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 |
| ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 |
| ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 |
| ; GFX7-NEXT: v_and_b32_e32 v15, 0x8000, v15 |
| ; GFX7-NEXT: v_bfe_u32 v7, v7, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v14, 0x8000, v14 |
| ; GFX7-NEXT: v_bfe_u32 v6, v6, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v13, 0x8000, v13 |
| ; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v12, 0x8000, v12 |
| ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v11, 0x8000, v11 |
| ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v10, 0x8000, v10 |
| ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v9, 0x8000, v9 |
| ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v8, 0x8000, v8 |
| ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v7, v7, v15 |
| ; GFX7-NEXT: v_or_b32_e32 v6, v6, v14 |
| ; GFX7-NEXT: v_or_b32_e32 v5, v5, v13 |
| ; GFX7-NEXT: v_or_b32_e32 v4, v4, v12 |
| ; GFX7-NEXT: v_or_b32_e32 v3, v3, v11 |
| ; GFX7-NEXT: v_or_b32_e32 v2, v2, v10 |
| ; GFX7-NEXT: v_or_b32_e32 v1, v1, v9 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v0, v8 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_v8bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v4 |
| ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v5 |
| ; GFX8-NEXT: v_bfi_b32 v2, s4, v2, v6 |
| ; GFX8-NEXT: v_bfi_b32 v3, s4, v3, v7 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_v8bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v4 |
| ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v5 |
| ; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v6 |
| ; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v7 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_v8bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v4 |
| ; GFX10-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v5 |
| ; GFX10-NEXT: v_bfi_b32 v2, 0x7fff7fff, v2, v6 |
| ; GFX10-NEXT: v_bfi_b32 v3, 0x7fff7fff, v3, v7 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-LABEL: v_copysign_v8bf16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v4 |
| ; GFX11-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v5 |
| ; GFX11-NEXT: v_bfi_b32 v2, 0x7fff7fff, v2, v6 |
| ; GFX11-NEXT: v_bfi_b32 v3, 0x7fff7fff, v3, v7 |
| ; GFX11-NEXT: s_setpc_b64 s[30:31] |
| ; Summary of the expectations above: the GCN and GFX7 prefixes expect eight independent mul/shift/and/or chains (v0-v7 combined with v8-v15), |
| ; while the GFX8 through GFX11 prefixes expect four v_bfi_b32 with mask 0x7fff7fff (v0-v3 combined with v4-v7). |
| %result = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> %mag, <8 x bfloat> %sign) |
| ret <8 x bfloat> %result |
| } |
| |
| define <16 x bfloat> @v_copysign_v16bf16(<16 x bfloat> %mag, <16 x bfloat> %sign) { |
| ; GCN-LABEL: v_copysign_v16bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 |
| ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 |
| ; GCN-NEXT: v_bfe_u32 v14, v14, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v30, 0x8000, v30 |
| ; GCN-NEXT: v_or_b32_e32 v14, v14, v30 |
| ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 |
| ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 |
| ; GCN-NEXT: v_bfe_u32 v13, v13, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v29, 0x8000, v29 |
| ; GCN-NEXT: v_or_b32_e32 v13, v13, v29 |
| ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 |
| ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 |
| ; GCN-NEXT: v_bfe_u32 v12, v12, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v28, 0x8000, v28 |
| ; GCN-NEXT: v_or_b32_e32 v12, v12, v28 |
| ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 |
| ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 |
| ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 |
| ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 |
| ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 |
| ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 |
| ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 |
| ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 |
| ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 |
| ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 |
| ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 |
| ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 |
| ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 |
| ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 |
| ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 |
| ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 |
| ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 |
| ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 |
| ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 |
| ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 |
| ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 |
| ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 |
| ; GCN-NEXT: v_bfe_u32 v11, v11, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v27, 0x8000, v27 |
| ; GCN-NEXT: v_or_b32_e32 v11, v11, v27 |
| ; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 |
| ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 |
| ; GCN-NEXT: v_bfe_u32 v10, v10, 16, 15 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 |
| ; GCN-NEXT: v_bfe_u32 v9, v9, 16, 15 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 |
| ; GCN-NEXT: v_bfe_u32 v8, v8, 16, 15 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 |
| ; GCN-NEXT: v_bfe_u32 v7, v7, 16, 15 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 |
| ; GCN-NEXT: v_bfe_u32 v6, v6, 16, 15 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 |
| ; GCN-NEXT: v_bfe_u32 v5, v5, 16, 15 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 |
| ; GCN-NEXT: v_bfe_u32 v4, v4, 16, 15 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 |
| ; GCN-NEXT: v_bfe_u32 v3, v3, 16, 15 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 |
| ; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 |
| ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 |
| ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v15, v15, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v26, 0x8000, v26 |
| ; GCN-NEXT: v_and_b32_e32 v25, 0x8000, v25 |
| ; GCN-NEXT: v_and_b32_e32 v24, 0x8000, v24 |
| ; GCN-NEXT: v_and_b32_e32 v23, 0x8000, v23 |
| ; GCN-NEXT: v_and_b32_e32 v22, 0x8000, v22 |
| ; GCN-NEXT: v_and_b32_e32 v21, 0x8000, v21 |
| ; GCN-NEXT: v_and_b32_e32 v20, 0x8000, v20 |
| ; GCN-NEXT: v_and_b32_e32 v19, 0x8000, v19 |
| ; GCN-NEXT: v_and_b32_e32 v18, 0x8000, v18 |
| ; GCN-NEXT: v_and_b32_e32 v17, 0x8000, v17 |
| ; GCN-NEXT: v_and_b32_e32 v16, 0x8000, v16 |
| ; GCN-NEXT: v_or_b32_e32 v10, v10, v26 |
| ; GCN-NEXT: v_or_b32_e32 v9, v9, v25 |
| ; GCN-NEXT: v_or_b32_e32 v8, v8, v24 |
| ; GCN-NEXT: v_or_b32_e32 v7, v7, v23 |
| ; GCN-NEXT: v_or_b32_e32 v6, v6, v22 |
| ; GCN-NEXT: v_or_b32_e32 v5, v5, v21 |
| ; GCN-NEXT: v_or_b32_e32 v4, v4, v20 |
| ; GCN-NEXT: v_or_b32_e32 v3, v3, v19 |
| ; GCN-NEXT: v_or_b32_e32 v2, v2, v18 |
| ; GCN-NEXT: v_or_b32_e32 v1, v1, v17 |
| ; GCN-NEXT: v_or_b32_e32 v0, v0, v16 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v27 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 |
| ; GCN-NEXT: v_and_b32_e32 v16, 0x8000, v16 |
| ; GCN-NEXT: v_or_b32_e32 v15, v15, v16 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_v16bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v27 |
| ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 |
| ; GFX7-NEXT: v_and_b32_e32 v27, 0x8000, v27 |
| ; GFX7-NEXT: v_bfe_u32 v11, v11, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v11, v11, v27 |
| ; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 |
| ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 |
| ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24 |
| ; GFX7-NEXT: v_bfe_u32 v8, v8, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v24, 0x8000, v24 |
| ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 |
| ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 |
| ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 |
| ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 |
| ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 |
| ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 |
| ; GFX7-NEXT: v_or_b32_e32 v8, v8, v24 |
| ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 |
| ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 |
| ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 |
| ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 |
| ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 |
| ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 |
| ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v30, 16, v30 |
| ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v29, 16, v29 |
| ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v28, 16, v28 |
| ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 |
| ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 |
| ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 |
| ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 |
| ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v26, 16, v26 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v25, 16, v25 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v23 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22 |
| ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v21 |
| ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v20 |
| ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v19 |
| ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v18 |
| ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17 |
| ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v16, 16, v16 |
| ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GFX7-NEXT: v_and_b32_e32 v30, 0x8000, v30 |
| ; GFX7-NEXT: v_bfe_u32 v14, v14, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v29, 0x8000, v29 |
| ; GFX7-NEXT: v_bfe_u32 v13, v13, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v28, 0x8000, v28 |
| ; GFX7-NEXT: v_bfe_u32 v12, v12, 16, 15 |
| ; GFX7-NEXT: v_bfe_u32 v10, v10, 16, 15 |
| ; GFX7-NEXT: v_bfe_u32 v9, v9, 16, 15 |
| ; GFX7-NEXT: v_bfe_u32 v15, v15, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v26, 0x8000, v26 |
| ; GFX7-NEXT: v_and_b32_e32 v25, 0x8000, v25 |
| ; GFX7-NEXT: v_and_b32_e32 v23, 0x8000, v23 |
| ; GFX7-NEXT: v_bfe_u32 v7, v7, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v22, 0x8000, v22 |
| ; GFX7-NEXT: v_bfe_u32 v6, v6, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v21, 0x8000, v21 |
| ; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v20, 0x8000, v20 |
| ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v19, 0x8000, v19 |
| ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v18, 0x8000, v18 |
| ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v17, 0x8000, v17 |
| ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v16, 0x8000, v16 |
| ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v14, v14, v30 |
| ; GFX7-NEXT: v_or_b32_e32 v13, v13, v29 |
| ; GFX7-NEXT: v_or_b32_e32 v12, v12, v28 |
| ; GFX7-NEXT: v_or_b32_e32 v10, v10, v26 |
| ; GFX7-NEXT: v_or_b32_e32 v9, v9, v25 |
| ; GFX7-NEXT: v_or_b32_e32 v7, v7, v23 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v27 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v24 |
| ; GFX7-NEXT: v_and_b32_e32 v24, 0x8000, v24 |
| ; GFX7-NEXT: v_or_b32_e32 v15, v15, v24 |
| ; GFX7-NEXT: v_or_b32_e32 v6, v6, v22 |
| ; GFX7-NEXT: v_or_b32_e32 v5, v5, v21 |
| ; GFX7-NEXT: v_or_b32_e32 v4, v4, v20 |
| ; GFX7-NEXT: v_or_b32_e32 v3, v3, v19 |
| ; GFX7-NEXT: v_or_b32_e32 v2, v2, v18 |
| ; GFX7-NEXT: v_or_b32_e32 v1, v1, v17 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v0, v16 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_v16bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v8 |
| ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v9 |
| ; GFX8-NEXT: v_bfi_b32 v2, s4, v2, v10 |
| ; GFX8-NEXT: v_bfi_b32 v3, s4, v3, v11 |
| ; GFX8-NEXT: v_bfi_b32 v4, s4, v4, v12 |
| ; GFX8-NEXT: v_bfi_b32 v5, s4, v5, v13 |
| ; GFX8-NEXT: v_bfi_b32 v6, s4, v6, v14 |
| ; GFX8-NEXT: v_bfi_b32 v7, s4, v7, v15 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_v16bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v8 |
| ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v9 |
| ; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v10 |
| ; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v11 |
| ; GFX9-NEXT: v_bfi_b32 v4, s4, v4, v12 |
| ; GFX9-NEXT: v_bfi_b32 v5, s4, v5, v13 |
| ; GFX9-NEXT: v_bfi_b32 v6, s4, v6, v14 |
| ; GFX9-NEXT: v_bfi_b32 v7, s4, v7, v15 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_v16bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v8 |
| ; GFX10-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v9 |
| ; GFX10-NEXT: v_bfi_b32 v2, 0x7fff7fff, v2, v10 |
| ; GFX10-NEXT: v_bfi_b32 v3, 0x7fff7fff, v3, v11 |
| ; GFX10-NEXT: v_bfi_b32 v4, 0x7fff7fff, v4, v12 |
| ; GFX10-NEXT: v_bfi_b32 v5, 0x7fff7fff, v5, v13 |
| ; GFX10-NEXT: v_bfi_b32 v6, 0x7fff7fff, v6, v14 |
| ; GFX10-NEXT: v_bfi_b32 v7, 0x7fff7fff, v7, v15 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-LABEL: v_copysign_v16bf16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v8 |
| ; GFX11-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v9 |
| ; GFX11-NEXT: v_bfi_b32 v2, 0x7fff7fff, v2, v10 |
| ; GFX11-NEXT: v_bfi_b32 v3, 0x7fff7fff, v3, v11 |
| ; GFX11-NEXT: v_bfi_b32 v4, 0x7fff7fff, v4, v12 |
| ; GFX11-NEXT: v_bfi_b32 v5, 0x7fff7fff, v5, v13 |
| ; GFX11-NEXT: v_bfi_b32 v6, 0x7fff7fff, v6, v14 |
| ; GFX11-NEXT: v_bfi_b32 v7, 0x7fff7fff, v7, v15 |
| ; GFX11-NEXT: s_setpc_b64 s[30:31] |
| %result = call <16 x bfloat> @llvm.copysign.v16bf16(<16 x bfloat> %mag, <16 x bfloat> %sign) |
| ret <16 x bfloat> %result |
| } |
| |
| define <32 x bfloat> @v_copysign_v32bf16(<32 x bfloat> %mag, <32 x bfloat> %sign) { |
| ; GCN-LABEL: v_copysign_v32bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 |
| ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 |
| ; GCN-NEXT: s_waitcnt vmcnt(1) |
| ; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31 |
| ; GCN-NEXT: v_bfe_u32 v32, v32, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v31, 0x8000, v31 |
| ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 |
| ; GCN-NEXT: v_or_b32_e32 v31, v32, v31 |
| ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 |
| ; GCN-NEXT: v_bfe_u32 v30, v30, 16, 15 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 |
| ; GCN-NEXT: v_or_b32_e32 v30, v30, v32 |
| ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 |
| ; GCN-NEXT: v_bfe_u32 v29, v29, 16, 15 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 |
| ; GCN-NEXT: v_or_b32_e32 v29, v29, v32 |
| ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 |
| ; GCN-NEXT: v_bfe_u32 v28, v28, 16, 15 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 |
| ; GCN-NEXT: v_or_b32_e32 v28, v28, v32 |
| ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 |
| ; GCN-NEXT: v_bfe_u32 v27, v27, 16, 15 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 |
| ; GCN-NEXT: v_or_b32_e32 v27, v27, v32 |
| ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 |
| ; GCN-NEXT: v_bfe_u32 v26, v26, 16, 15 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 |
| ; GCN-NEXT: v_or_b32_e32 v26, v26, v32 |
| ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 |
| ; GCN-NEXT: v_bfe_u32 v25, v25, 16, 15 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100 |
| ; GCN-NEXT: v_or_b32_e32 v25, v25, v32 |
| ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 |
| ; GCN-NEXT: v_bfe_u32 v24, v24, 16, 15 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 |
| ; GCN-NEXT: v_or_b32_e32 v24, v24, v32 |
| ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 |
| ; GCN-NEXT: v_bfe_u32 v23, v23, 16, 15 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 |
| ; GCN-NEXT: v_or_b32_e32 v23, v23, v32 |
| ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 |
| ; GCN-NEXT: v_bfe_u32 v22, v22, 16, 15 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 |
| ; GCN-NEXT: v_or_b32_e32 v22, v22, v32 |
| ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 |
| ; GCN-NEXT: v_bfe_u32 v21, v21, 16, 15 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84 |
| ; GCN-NEXT: v_or_b32_e32 v21, v21, v32 |
| ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 |
| ; GCN-NEXT: v_bfe_u32 v20, v20, 16, 15 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 |
| ; GCN-NEXT: v_or_b32_e32 v20, v20, v32 |
| ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 |
| ; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 |
| ; GCN-NEXT: v_or_b32_e32 v19, v19, v32 |
| ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 |
| ; GCN-NEXT: v_bfe_u32 v18, v18, 16, 15 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 |
| ; GCN-NEXT: v_or_b32_e32 v18, v18, v32 |
| ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 |
| ; GCN-NEXT: v_bfe_u32 v17, v17, 16, 15 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 |
| ; GCN-NEXT: v_or_b32_e32 v17, v17, v32 |
| ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 |
| ; GCN-NEXT: v_bfe_u32 v16, v16, 16, 15 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 |
| ; GCN-NEXT: v_or_b32_e32 v16, v16, v32 |
| ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 |
| ; GCN-NEXT: v_bfe_u32 v15, v15, 16, 15 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 |
| ; GCN-NEXT: v_or_b32_e32 v15, v15, v32 |
| ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 |
| ; GCN-NEXT: v_bfe_u32 v14, v14, 16, 15 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 |
| ; GCN-NEXT: v_or_b32_e32 v14, v14, v32 |
| ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 |
| ; GCN-NEXT: v_bfe_u32 v13, v13, 16, 15 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 |
| ; GCN-NEXT: v_or_b32_e32 v13, v13, v32 |
| ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 |
| ; GCN-NEXT: v_bfe_u32 v12, v12, 16, 15 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 |
| ; GCN-NEXT: v_or_b32_e32 v12, v12, v32 |
| ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 |
| ; GCN-NEXT: v_bfe_u32 v11, v11, 16, 15 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44 |
| ; GCN-NEXT: v_or_b32_e32 v11, v11, v32 |
| ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 |
| ; GCN-NEXT: v_bfe_u32 v10, v10, 16, 15 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 |
| ; GCN-NEXT: v_or_b32_e32 v10, v10, v32 |
| ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 |
| ; GCN-NEXT: v_bfe_u32 v9, v9, 16, 15 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 |
| ; GCN-NEXT: v_or_b32_e32 v9, v9, v32 |
| ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 |
| ; GCN-NEXT: v_bfe_u32 v8, v8, 16, 15 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 |
| ; GCN-NEXT: v_or_b32_e32 v8, v8, v32 |
| ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 |
| ; GCN-NEXT: v_bfe_u32 v7, v7, 16, 15 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 |
| ; GCN-NEXT: v_or_b32_e32 v7, v7, v32 |
| ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 |
| ; GCN-NEXT: v_bfe_u32 v6, v6, 16, 15 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 |
| ; GCN-NEXT: v_or_b32_e32 v6, v6, v32 |
| ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 |
| ; GCN-NEXT: v_bfe_u32 v5, v5, 16, 15 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 |
| ; GCN-NEXT: v_or_b32_e32 v5, v5, v32 |
| ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 |
| ; GCN-NEXT: v_bfe_u32 v4, v4, 16, 15 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 |
| ; GCN-NEXT: v_or_b32_e32 v4, v4, v32 |
| ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 |
| ; GCN-NEXT: v_bfe_u32 v3, v3, 16, 15 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 |
| ; GCN-NEXT: v_or_b32_e32 v3, v3, v32 |
| ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 |
| ; GCN-NEXT: v_or_b32_e32 v2, v2, v32 |
| ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 |
| ; GCN-NEXT: v_or_b32_e32 v1, v1, v32 |
| ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GCN-NEXT: v_or_b32_e32 v0, v0, v32 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_v32bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 |
| ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 |
| ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 |
| ; GFX7-NEXT: v_bfe_u32 v30, v30, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 |
| ; GFX7-NEXT: v_bfe_u32 v29, v29, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 |
| ; GFX7-NEXT: v_bfe_u32 v28, v28, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 |
| ; GFX7-NEXT: v_bfe_u32 v27, v27, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 |
| ; GFX7-NEXT: v_bfe_u32 v26, v26, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 |
| ; GFX7-NEXT: v_bfe_u32 v25, v25, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 |
| ; GFX7-NEXT: v_bfe_u32 v24, v24, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 |
| ; GFX7-NEXT: v_bfe_u32 v23, v23, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 |
| ; GFX7-NEXT: v_bfe_u32 v22, v22, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 |
| ; GFX7-NEXT: v_bfe_u32 v21, v21, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 |
| ; GFX7-NEXT: v_bfe_u32 v20, v20, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 |
| ; GFX7-NEXT: v_bfe_u32 v19, v19, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 |
| ; GFX7-NEXT: v_bfe_u32 v18, v18, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 |
| ; GFX7-NEXT: v_bfe_u32 v17, v17, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 |
| ; GFX7-NEXT: v_bfe_u32 v16, v16, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 |
| ; GFX7-NEXT: v_bfe_u32 v15, v15, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 |
| ; GFX7-NEXT: v_bfe_u32 v14, v14, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 |
| ; GFX7-NEXT: v_bfe_u32 v13, v13, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 |
| ; GFX7-NEXT: v_bfe_u32 v12, v12, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 |
| ; GFX7-NEXT: v_bfe_u32 v11, v11, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 |
| ; GFX7-NEXT: v_bfe_u32 v10, v10, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 |
| ; GFX7-NEXT: v_bfe_u32 v9, v9, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 |
| ; GFX7-NEXT: v_bfe_u32 v8, v8, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 |
| ; GFX7-NEXT: v_bfe_u32 v7, v7, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 |
| ; GFX7-NEXT: v_bfe_u32 v6, v6, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 |
| ; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 |
| ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 |
| ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GFX7-NEXT: s_waitcnt vmcnt(1) |
| ; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31 |
| ; GFX7-NEXT: v_bfe_u32 v32, v32, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v31, 0x8000, v31 |
| ; GFX7-NEXT: v_or_b32_e32 v31, v32, v31 |
| ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GFX7-NEXT: v_or_b32_e32 v30, v30, v32 |
| ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v30 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GFX7-NEXT: v_or_b32_e32 v29, v29, v32 |
| ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GFX7-NEXT: v_or_b32_e32 v28, v28, v32 |
| ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v28 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GFX7-NEXT: v_or_b32_e32 v27, v27, v32 |
| ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v27 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GFX7-NEXT: v_or_b32_e32 v26, v26, v32 |
| ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v26 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GFX7-NEXT: v_or_b32_e32 v25, v25, v32 |
| ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v25 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GFX7-NEXT: v_or_b32_e32 v24, v24, v32 |
| ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v24 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GFX7-NEXT: v_or_b32_e32 v23, v23, v32 |
| ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GFX7-NEXT: v_or_b32_e32 v22, v22, v32 |
| ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v22 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GFX7-NEXT: v_or_b32_e32 v21, v21, v32 |
| ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GFX7-NEXT: v_or_b32_e32 v20, v20, v32 |
| ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v20 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GFX7-NEXT: v_or_b32_e32 v19, v19, v32 |
| ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v19 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GFX7-NEXT: v_or_b32_e32 v18, v18, v32 |
| ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v18 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GFX7-NEXT: v_or_b32_e32 v17, v17, v32 |
| ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GFX7-NEXT: v_or_b32_e32 v16, v16, v32 |
| ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v16 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GFX7-NEXT: v_or_b32_e32 v15, v15, v32 |
| ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GFX7-NEXT: v_or_b32_e32 v14, v14, v32 |
| ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GFX7-NEXT: v_or_b32_e32 v13, v13, v32 |
| ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GFX7-NEXT: v_or_b32_e32 v12, v12, v32 |
| ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GFX7-NEXT: v_or_b32_e32 v11, v11, v32 |
| ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GFX7-NEXT: v_or_b32_e32 v10, v10, v32 |
| ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GFX7-NEXT: v_or_b32_e32 v9, v9, v32 |
| ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GFX7-NEXT: v_or_b32_e32 v8, v8, v32 |
| ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GFX7-NEXT: v_or_b32_e32 v7, v7, v32 |
| ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GFX7-NEXT: v_or_b32_e32 v6, v6, v32 |
| ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GFX7-NEXT: v_or_b32_e32 v5, v5, v32 |
| ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GFX7-NEXT: v_or_b32_e32 v4, v4, v32 |
| ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GFX7-NEXT: v_or_b32_e32 v3, v3, v32 |
| ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GFX7-NEXT: v_or_b32_e32 v2, v2, v32 |
| ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GFX7-NEXT: v_or_b32_e32 v1, v1, v32 |
| ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32 |
| ; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v0, v32 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_v32bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v16 |
| ; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 |
| ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v17 |
| ; GFX8-NEXT: v_bfi_b32 v2, s4, v2, v18 |
| ; GFX8-NEXT: v_bfi_b32 v3, s4, v3, v19 |
| ; GFX8-NEXT: v_bfi_b32 v4, s4, v4, v20 |
| ; GFX8-NEXT: v_bfi_b32 v5, s4, v5, v21 |
| ; GFX8-NEXT: v_bfi_b32 v6, s4, v6, v22 |
| ; GFX8-NEXT: v_bfi_b32 v7, s4, v7, v23 |
| ; GFX8-NEXT: v_bfi_b32 v8, s4, v8, v24 |
| ; GFX8-NEXT: v_bfi_b32 v9, s4, v9, v25 |
| ; GFX8-NEXT: v_bfi_b32 v10, s4, v10, v26 |
| ; GFX8-NEXT: v_bfi_b32 v11, s4, v11, v27 |
| ; GFX8-NEXT: v_bfi_b32 v12, s4, v12, v28 |
| ; GFX8-NEXT: v_bfi_b32 v13, s4, v13, v29 |
| ; GFX8-NEXT: v_bfi_b32 v14, s4, v14, v30 |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: v_bfi_b32 v15, s4, v15, v16 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_v32bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v16 |
| ; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 |
| ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v17 |
| ; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v18 |
| ; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v19 |
| ; GFX9-NEXT: v_bfi_b32 v4, s4, v4, v20 |
| ; GFX9-NEXT: v_bfi_b32 v5, s4, v5, v21 |
| ; GFX9-NEXT: v_bfi_b32 v6, s4, v6, v22 |
| ; GFX9-NEXT: v_bfi_b32 v7, s4, v7, v23 |
| ; GFX9-NEXT: v_bfi_b32 v8, s4, v8, v24 |
| ; GFX9-NEXT: v_bfi_b32 v9, s4, v9, v25 |
| ; GFX9-NEXT: v_bfi_b32 v10, s4, v10, v26 |
| ; GFX9-NEXT: v_bfi_b32 v11, s4, v11, v27 |
| ; GFX9-NEXT: v_bfi_b32 v12, s4, v12, v28 |
| ; GFX9-NEXT: v_bfi_b32 v13, s4, v13, v29 |
| ; GFX9-NEXT: v_bfi_b32 v14, s4, v14, v30 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_bfi_b32 v15, s4, v15, v16 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_v32bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v16 |
| ; GFX10-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v17 |
| ; GFX10-NEXT: v_bfi_b32 v2, 0x7fff7fff, v2, v18 |
| ; GFX10-NEXT: v_bfi_b32 v3, 0x7fff7fff, v3, v19 |
| ; GFX10-NEXT: v_bfi_b32 v4, 0x7fff7fff, v4, v20 |
| ; GFX10-NEXT: v_bfi_b32 v5, 0x7fff7fff, v5, v21 |
| ; GFX10-NEXT: v_bfi_b32 v6, 0x7fff7fff, v6, v22 |
| ; GFX10-NEXT: v_bfi_b32 v7, 0x7fff7fff, v7, v23 |
| ; GFX10-NEXT: v_bfi_b32 v8, 0x7fff7fff, v8, v24 |
| ; GFX10-NEXT: v_bfi_b32 v9, 0x7fff7fff, v9, v25 |
| ; GFX10-NEXT: v_bfi_b32 v10, 0x7fff7fff, v10, v26 |
| ; GFX10-NEXT: v_bfi_b32 v11, 0x7fff7fff, v11, v27 |
| ; GFX10-NEXT: v_bfi_b32 v12, 0x7fff7fff, v12, v28 |
| ; GFX10-NEXT: v_bfi_b32 v13, 0x7fff7fff, v13, v29 |
| ; GFX10-NEXT: v_bfi_b32 v14, 0x7fff7fff, v14, v30 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_bfi_b32 v15, 0x7fff7fff, v15, v31 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-LABEL: v_copysign_v32bf16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: scratch_load_b32 v31, off, s32 |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v16 |
| ; GFX11-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v17 |
| ; GFX11-NEXT: v_bfi_b32 v2, 0x7fff7fff, v2, v18 |
| ; GFX11-NEXT: v_bfi_b32 v3, 0x7fff7fff, v3, v19 |
| ; GFX11-NEXT: v_bfi_b32 v4, 0x7fff7fff, v4, v20 |
| ; GFX11-NEXT: v_bfi_b32 v5, 0x7fff7fff, v5, v21 |
| ; GFX11-NEXT: v_bfi_b32 v6, 0x7fff7fff, v6, v22 |
| ; GFX11-NEXT: v_bfi_b32 v7, 0x7fff7fff, v7, v23 |
| ; GFX11-NEXT: v_bfi_b32 v8, 0x7fff7fff, v8, v24 |
| ; GFX11-NEXT: v_bfi_b32 v9, 0x7fff7fff, v9, v25 |
| ; GFX11-NEXT: v_bfi_b32 v10, 0x7fff7fff, v10, v26 |
| ; GFX11-NEXT: v_bfi_b32 v11, 0x7fff7fff, v11, v27 |
| ; GFX11-NEXT: v_bfi_b32 v12, 0x7fff7fff, v12, v28 |
| ; GFX11-NEXT: v_bfi_b32 v13, 0x7fff7fff, v13, v29 |
| ; GFX11-NEXT: v_bfi_b32 v14, 0x7fff7fff, v14, v30 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_bfi_b32 v15, 0x7fff7fff, v15, v31 |
| ; GFX11-NEXT: s_setpc_b64 s[30:31] |
| %result = call <32 x bfloat> @llvm.copysign.v32bf16(<32 x bfloat> %mag, <32 x bfloat> %sign) |
| ret <32 x bfloat> %result |
| } |
| |
| ; Shader (amdgpu_ps) test with inreg arguments: float copysign whose magnitude is |
| ; a bfloat widened with fpext and whose sign operand is already a float. The |
| ; float result is bitcast to i32 for the return value. |
| ; In the expectations below, the tonga, gfx900, gfx1010 and gfx1100 runs shift the |
| ; magnitude register left by 16 before the v_bfi_b32; the default-target and |
| ; hawaii runs feed s0 to v_bfi_b32 unshifted. |
| define amdgpu_ps i32 @s_copysign_out_f32_mag_bf16_sign_f32(bfloat inreg %mag, float inreg %sign) { |
| ; GCN-LABEL: s_copysign_out_f32_mag_bf16_sign_f32: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_brev_b32 s2, -2 |
| ; GCN-NEXT: v_mov_b32_e32 v0, s0 |
| ; GCN-NEXT: v_mov_b32_e32 v1, s1 |
| ; GCN-NEXT: v_bfi_b32 v0, s2, v0, v1 |
| ; GCN-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX7-LABEL: s_copysign_out_f32_mag_bf16_sign_f32: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_brev_b32 s2, -2 |
| ; GFX7-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX7-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX7-NEXT: v_bfi_b32 v0, s2, v0, v1 |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX7-NEXT: ; return to shader part epilog |
| ; |
| ; GFX8-LABEL: s_copysign_out_f32_mag_bf16_sign_f32: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 |
| ; GFX8-NEXT: s_brev_b32 s2, -2 |
| ; GFX8-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX8-NEXT: v_bfi_b32 v0, s2, v0, v1 |
| ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX8-NEXT: ; return to shader part epilog |
| ; |
| ; GFX9-LABEL: s_copysign_out_f32_mag_bf16_sign_f32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 |
| ; GFX9-NEXT: s_brev_b32 s2, -2 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1 |
| ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: s_copysign_out_f32_mag_bf16_sign_f32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s1 |
| ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 |
| ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: s_copysign_out_f32_mag_bf16_sign_f32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s1 |
| ; GFX11-NEXT: s_lshl_b32 s0, s0, 16 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; Widen the magnitude so the copysign itself is performed on float operands. |
| %mag.ext = fpext bfloat %mag to float |
| %out = call float @llvm.copysign.f32(float %mag.ext, float %sign) |
| %cast = bitcast float %out to i32 |
| ret i32 %cast |
| } |
| |
| ; Shader (amdgpu_ps) test with inreg arguments: double copysign whose magnitude |
| ; is a bfloat widened with fpext and whose sign operand is already a double. The |
| ; double result is bitcast to <2 x i32> for the return value. |
| ; In the expectations below, every target converts with v_cvt_f64_f32 and applies |
| ; v_bfi_b32 only to the upper result register (v1), taking the sign bits from s2 |
| ; (presumably the high dword of %sign). The tonga and newer runs first shift s0 |
| ; left by 16; the default-target and hawaii runs do not. |
| define amdgpu_ps <2 x i32> @s_copysign_out_f64_mag_bf16_sign_f64(bfloat inreg %mag, double inreg %sign) { |
| ; GCN-LABEL: s_copysign_out_f64_mag_bf16_sign_f64: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], s0 |
| ; GCN-NEXT: s_brev_b32 s0, -2 |
| ; GCN-NEXT: v_mov_b32_e32 v2, s2 |
| ; GCN-NEXT: v_bfi_b32 v1, s0, v1, v2 |
| ; GCN-NEXT: v_readfirstlane_b32 s1, v1 |
| ; GCN-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX7-LABEL: s_copysign_out_f64_mag_bf16_sign_f64: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], s0 |
| ; GFX7-NEXT: s_brev_b32 s0, -2 |
| ; GFX7-NEXT: v_mov_b32_e32 v2, s2 |
| ; GFX7-NEXT: v_bfi_b32 v1, s0, v1, v2 |
| ; GFX7-NEXT: v_readfirstlane_b32 s1, v1 |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX7-NEXT: ; return to shader part epilog |
| ; |
| ; GFX8-LABEL: s_copysign_out_f64_mag_bf16_sign_f64: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 |
| ; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], s0 |
| ; GFX8-NEXT: s_brev_b32 s0, -2 |
| ; GFX8-NEXT: v_mov_b32_e32 v2, s2 |
| ; GFX8-NEXT: v_bfi_b32 v1, s0, v1, v2 |
| ; GFX8-NEXT: v_readfirstlane_b32 s1, v1 |
| ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX8-NEXT: ; return to shader part epilog |
| ; |
| ; GFX9-LABEL: s_copysign_out_f64_mag_bf16_sign_f64: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 |
| ; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], s0 |
| ; GFX9-NEXT: s_brev_b32 s0, -2 |
| ; GFX9-NEXT: v_mov_b32_e32 v2, s2 |
| ; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2 |
| ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 |
| ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: s_copysign_out_f64_mag_bf16_sign_f64: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 |
| ; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], s0 |
| ; GFX10-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s2 |
| ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: s_copysign_out_f64_mag_bf16_sign_f64: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_lshl_b32 s0, s0, 16 |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], s0 |
| ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s2 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX11-NEXT: v_readfirstlane_b32 s1, v1 |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; Widen the magnitude so the copysign itself is performed on double operands. |
| %mag.ext = fpext bfloat %mag to double |
| %out = call double @llvm.copysign.f64(double %mag.ext, double %sign) |
| %cast = bitcast double %out to <2 x i32> |
| ret <2 x i32> %cast |
| } |
| |
| ; Shader (amdgpu_ps) test with inreg arguments: float copysign whose magnitude is |
| ; already a float and whose sign operand is a bfloat widened with fpext. The |
| ; float result is bitcast to i32 for the return value. |
| ; In the expectations below, the tonga and newer runs shift the sign register (s1) |
| ; left by 16 before the v_bfi_b32; the default-target and hawaii runs use s1 |
| ; unshifted. |
| define amdgpu_ps i32 @s_copysign_out_f32_mag_f32_sign_bf16(float inreg %mag, bfloat inreg %sign) { |
| ; GCN-LABEL: s_copysign_out_f32_mag_f32_sign_bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_brev_b32 s2, -2 |
| ; GCN-NEXT: v_mov_b32_e32 v0, s0 |
| ; GCN-NEXT: v_mov_b32_e32 v1, s1 |
| ; GCN-NEXT: v_bfi_b32 v0, s2, v0, v1 |
| ; GCN-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX7-LABEL: s_copysign_out_f32_mag_f32_sign_bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_brev_b32 s2, -2 |
| ; GFX7-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX7-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX7-NEXT: v_bfi_b32 v0, s2, v0, v1 |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX7-NEXT: ; return to shader part epilog |
| ; |
| ; GFX8-LABEL: s_copysign_out_f32_mag_f32_sign_bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: v_lshlrev_b32_e64 v0, 16, s1 |
| ; GFX8-NEXT: s_brev_b32 s1, -2 |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX8-NEXT: v_bfi_b32 v0, s1, v1, v0 |
| ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX8-NEXT: ; return to shader part epilog |
| ; |
| ; GFX9-LABEL: s_copysign_out_f32_mag_f32_sign_bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_lshlrev_b32_e64 v0, 16, s1 |
| ; GFX9-NEXT: s_brev_b32 s1, -2 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX9-NEXT: v_bfi_b32 v0, s1, v1, v0 |
| ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: s_copysign_out_f32_mag_f32_sign_bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_lshlrev_b32_e64 v0, 16, s1 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 |
| ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: s_copysign_out_f32_mag_f32_sign_bf16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 |
| ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; Widen the sign operand so the copysign itself is performed on float operands. |
| %sign.ext = fpext bfloat %sign to float |
| %out = call float @llvm.copysign.f32(float %mag, float %sign.ext) |
| %cast = bitcast float %out to i32 |
| ret i32 %cast |
| } |
| |
| ; Shader (amdgpu_ps) test with inreg arguments: double copysign whose magnitude |
| ; is already a double and whose sign operand is a bfloat widened with fpext. The |
| ; double result is bitcast to <2 x i32> for the return value. |
| ; In the expectations below, only s1 is rewritten (via v_bfi_b32 and |
| ; v_readfirstlane_b32); no expected instruction writes s0. The tonga and newer |
| ; runs shift the sign register (s2) left by 16 first; the default-target and |
| ; hawaii runs use s2 unshifted. |
| define amdgpu_ps <2 x i32> @s_copysign_out_f64_mag_f64_sign_bf16(double inreg %mag, bfloat inreg %sign) { |
| ; GCN-LABEL: s_copysign_out_f64_mag_f64_sign_bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_brev_b32 s3, -2 |
| ; GCN-NEXT: v_mov_b32_e32 v0, s1 |
| ; GCN-NEXT: v_mov_b32_e32 v1, s2 |
| ; GCN-NEXT: v_bfi_b32 v0, s3, v0, v1 |
| ; GCN-NEXT: v_readfirstlane_b32 s1, v0 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX7-LABEL: s_copysign_out_f64_mag_f64_sign_bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_brev_b32 s3, -2 |
| ; GFX7-NEXT: v_mov_b32_e32 v0, s1 |
| ; GFX7-NEXT: v_mov_b32_e32 v1, s2 |
| ; GFX7-NEXT: v_bfi_b32 v0, s3, v0, v1 |
| ; GFX7-NEXT: v_readfirstlane_b32 s1, v0 |
| ; GFX7-NEXT: ; return to shader part epilog |
| ; |
| ; GFX8-LABEL: s_copysign_out_f64_mag_f64_sign_bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: v_lshlrev_b32_e64 v0, 16, s2 |
| ; GFX8-NEXT: s_brev_b32 s2, -2 |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX8-NEXT: v_bfi_b32 v0, s2, v1, v0 |
| ; GFX8-NEXT: v_readfirstlane_b32 s1, v0 |
| ; GFX8-NEXT: ; return to shader part epilog |
| ; |
| ; GFX9-LABEL: s_copysign_out_f64_mag_f64_sign_bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_lshlrev_b32_e64 v0, 16, s2 |
| ; GFX9-NEXT: s_brev_b32 s2, -2 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX9-NEXT: v_bfi_b32 v0, s2, v1, v0 |
| ; GFX9-NEXT: v_readfirstlane_b32 s1, v0 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: s_copysign_out_f64_mag_f64_sign_bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_lshlrev_b32_e64 v0, 16, s2 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0 |
| ; GFX10-NEXT: v_readfirstlane_b32 s1, v0 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: s_copysign_out_f64_mag_f64_sign_bf16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s2 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0 |
| ; GFX11-NEXT: v_readfirstlane_b32 s1, v0 |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; Widen the sign operand so the copysign itself is performed on double operands. |
| %sign.ext = fpext bfloat %sign to double |
| %out = call double @llvm.copysign.f64(double %mag, double %sign.ext) |
| %cast = bitcast double %out to <2 x i32> |
| ret <2 x i32> %cast |
| } |
| |
| ; Shader (amdgpu_ps) test with inreg arguments: bfloat copysign whose magnitude |
| ; is already a bfloat and whose sign operand is a float narrowed with fptrunc. |
| ; The bfloat result is bitcast to i16 for the return value. |
| ; None of the expectations below contain a rounding sequence for the fptrunc: the |
| ; sign is taken directly from s1, either by a 16-bit right shift (tonga and |
| ; newer) or by masking with 0x80000000 and then shifting (default target and |
| ; hawaii). |
| define amdgpu_ps i16 @s_copysign_out_bf16_mag_bf16_sign_f32(bfloat inreg %mag, float inreg %sign) { |
| ; GCN-LABEL: s_copysign_out_bf16_mag_bf16_sign_f32: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 |
| ; GCN-NEXT: s_and_b32 s0, s1, 0x80000000 |
| ; GCN-NEXT: s_lshr_b32 s0, s0, 16 |
| ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GCN-NEXT: v_or_b32_e32 v0, s0, v0 |
| ; GCN-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX7-LABEL: s_copysign_out_bf16_mag_bf16_sign_f32: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0 |
| ; GFX7-NEXT: s_and_b32 s0, s1, 0x80000000 |
| ; GFX7-NEXT: s_lshr_b32 s0, s0, 16 |
| ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX7-NEXT: ; return to shader part epilog |
| ; |
| ; GFX8-LABEL: s_copysign_out_bf16_mag_bf16_sign_f32: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: v_lshrrev_b32_e64 v0, 16, s1 |
| ; GFX8-NEXT: s_movk_i32 s1, 0x7fff |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX8-NEXT: v_bfi_b32 v0, s1, v1, v0 |
| ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX8-NEXT: ; return to shader part epilog |
| ; |
| ; GFX9-LABEL: s_copysign_out_bf16_mag_bf16_sign_f32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 16, s1 |
| ; GFX9-NEXT: s_movk_i32 s1, 0x7fff |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX9-NEXT: v_bfi_b32 v0, s1, v1, v0 |
| ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: s_copysign_out_bf16_mag_bf16_sign_f32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 16, s1 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 |
| ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: s_copysign_out_bf16_mag_bf16_sign_f32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_lshrrev_b32_e64 v0, 16, s1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 |
| ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; Narrow the sign operand so the copysign itself is performed on bfloat operands. |
| %sign.trunc = fptrunc float %sign to bfloat |
| %out = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign.trunc) |
| %cast = bitcast bfloat %out to i16 |
| ret i16 %cast |
| } |
| |
| ; Shader (amdgpu_ps) test with inreg arguments: bfloat copysign whose magnitude |
| ; is already a bfloat and whose sign operand is a double narrowed with fptrunc. |
| ; The bfloat result is bitcast to i16 for the return value. |
| ; None of the expectations below contain a rounding sequence for the fptrunc: the |
| ; sign is taken directly from s2 (presumably the high dword of %sign), either by |
| ; a 16-bit right shift (tonga and newer) or by masking with 0x80000000 and then |
| ; shifting (default target and hawaii). |
| define amdgpu_ps i16 @s_copysign_out_bf16_mag_bf16_sign_f64(bfloat inreg %mag, double inreg %sign) { |
| ; GCN-LABEL: s_copysign_out_bf16_mag_bf16_sign_f64: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 |
| ; GCN-NEXT: s_and_b32 s0, s2, 0x80000000 |
| ; GCN-NEXT: s_lshr_b32 s0, s0, 16 |
| ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GCN-NEXT: v_or_b32_e32 v0, s0, v0 |
| ; GCN-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX7-LABEL: s_copysign_out_bf16_mag_bf16_sign_f64: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0 |
| ; GFX7-NEXT: s_and_b32 s0, s2, 0x80000000 |
| ; GFX7-NEXT: s_lshr_b32 s0, s0, 16 |
| ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX7-NEXT: ; return to shader part epilog |
| ; |
| ; GFX8-LABEL: s_copysign_out_bf16_mag_bf16_sign_f64: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: v_lshrrev_b32_e64 v0, 16, s2 |
| ; GFX8-NEXT: s_movk_i32 s1, 0x7fff |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX8-NEXT: v_bfi_b32 v0, s1, v1, v0 |
| ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX8-NEXT: ; return to shader part epilog |
| ; |
| ; GFX9-LABEL: s_copysign_out_bf16_mag_bf16_sign_f64: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 16, s2 |
| ; GFX9-NEXT: s_movk_i32 s1, 0x7fff |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX9-NEXT: v_bfi_b32 v0, s1, v1, v0 |
| ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: s_copysign_out_bf16_mag_bf16_sign_f64: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 16, s2 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 |
| ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: s_copysign_out_bf16_mag_bf16_sign_f64: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_lshrrev_b32_e64 v0, 16, s2 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 |
| ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX11-NEXT: ; return to shader part epilog |
| ; Narrow the sign operand so the copysign itself is performed on bfloat operands. |
| %sign.trunc = fptrunc double %sign to bfloat |
| %out = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign.trunc) |
| %cast = bitcast bfloat %out to i16 |
| ret i16 %cast |
| } |
| |
| ; Shader (amdgpu_ps) test with inreg arguments: bfloat copysign whose magnitude |
| ; is a float narrowed with fptrunc and whose sign operand is already a bfloat. |
| ; The bfloat result is bitcast to i16 for the return value. |
| ; Unlike the tests that narrow only the sign operand, the tonga and newer |
| ; expectations below contain a scalar sequence ahead of the v_bfi_b32 (s_bfe_u32, |
| ; add of 0x7fff, v_cmp_u_f32 + s_cselect_b32 against s0 with bit 22 set, then a |
| ; 16-bit right shift) - presumably the float-to-bfloat rounding with NaN |
| ; handling; confirm against the AMDGPU lowering. The default-target and hawaii |
| ; expectations instead extract bits 16..30 of the magnitude with v_bfe_u32. |
| ; The two gfx1100 run lines (+real-true16 and -real-true16) have separate |
| ; expectations for this function. |
| define amdgpu_ps i16 @s_copysign_out_bf16_mag_f32_sign_bf16(float inreg %mag, bfloat inreg %sign) { |
| ; GCN-LABEL: s_copysign_out_bf16_mag_f32_sign_bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 |
| ; GCN-NEXT: s_and_b32 s0, s1, 0x80000000 |
| ; GCN-NEXT: s_lshr_b32 s0, s0, 16 |
| ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GCN-NEXT: v_or_b32_e32 v0, s0, v0 |
| ; GCN-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX7-LABEL: s_copysign_out_bf16_mag_f32_sign_bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0 |
| ; GFX7-NEXT: s_and_b32 s0, s1, 0x80000000 |
| ; GFX7-NEXT: s_lshr_b32 s0, s0, 16 |
| ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX7-NEXT: ; return to shader part epilog |
| ; |
| ; GFX8-LABEL: s_copysign_out_bf16_mag_f32_sign_bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_bfe_u32 s2, s0, 0x10010 |
| ; GFX8-NEXT: s_add_i32 s2, s2, s0 |
| ; GFX8-NEXT: s_or_b32 s4, s0, 0x400000 |
| ; GFX8-NEXT: s_add_i32 s6, s2, 0x7fff |
| ; GFX8-NEXT: v_cmp_u_f32_e64 s[2:3], s0, s0 |
| ; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], exec |
| ; GFX8-NEXT: s_cselect_b32 s0, s4, s6 |
| ; GFX8-NEXT: s_lshr_b32 s0, s0, 16 |
| ; GFX8-NEXT: s_movk_i32 s5, 0x7fff |
| ; GFX8-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX8-NEXT: v_bfi_b32 v0, s5, v0, v1 |
| ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX8-NEXT: ; return to shader part epilog |
| ; |
| ; GFX9-LABEL: s_copysign_out_bf16_mag_f32_sign_bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_bfe_u32 s2, s0, 0x10010 |
| ; GFX9-NEXT: s_add_i32 s2, s2, s0 |
| ; GFX9-NEXT: s_or_b32 s4, s0, 0x400000 |
| ; GFX9-NEXT: s_add_i32 s6, s2, 0x7fff |
| ; GFX9-NEXT: v_cmp_u_f32_e64 s[2:3], s0, s0 |
| ; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec |
| ; GFX9-NEXT: s_cselect_b32 s0, s4, s6 |
| ; GFX9-NEXT: s_lshr_b32 s0, s0, 16 |
| ; GFX9-NEXT: s_movk_i32 s5, 0x7fff |
| ; GFX9-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX9-NEXT: v_bfi_b32 v0, s5, v0, v1 |
| ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: s_copysign_out_bf16_mag_f32_sign_bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_bfe_u32 s2, s0, 0x10010 |
| ; GFX10-NEXT: v_cmp_u_f32_e64 s3, s0, s0 |
| ; GFX10-NEXT: s_add_i32 s2, s2, s0 |
| ; GFX10-NEXT: s_bitset1_b32 s0, 22 |
| ; GFX10-NEXT: s_addk_i32 s2, 0x7fff |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s1 |
| ; GFX10-NEXT: s_and_b32 s3, s3, exec_lo |
| ; GFX10-NEXT: s_cselect_b32 s0, s0, s2 |
| ; GFX10-NEXT: s_lshr_b32 s0, s0, 16 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 |
| ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11TRUE16-LABEL: s_copysign_out_bf16_mag_f32_sign_bf16: |
| ; GFX11TRUE16: ; %bb.0: |
| ; GFX11TRUE16-NEXT: s_bfe_u32 s2, s0, 0x10010 |
| ; GFX11TRUE16-NEXT: v_cmp_u_f32_e64 s3, s0, s0 |
| ; GFX11TRUE16-NEXT: s_add_i32 s2, s2, s0 |
| ; GFX11TRUE16-NEXT: s_bitset1_b32 s0, 22 |
| ; GFX11TRUE16-NEXT: s_addk_i32 s2, 0x7fff |
| ; GFX11TRUE16-NEXT: s_and_b32 s3, s3, exec_lo |
| ; GFX11TRUE16-NEXT: s_cselect_b32 s0, s0, s2 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) |
| ; GFX11TRUE16-NEXT: s_lshr_b32 s0, s0, 16 |
| ; GFX11TRUE16-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, s1 |
| ; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX11TRUE16-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11FAKE16-LABEL: s_copysign_out_bf16_mag_f32_sign_bf16: |
| ; GFX11FAKE16: ; %bb.0: |
| ; GFX11FAKE16-NEXT: s_bfe_u32 s2, s0, 0x10010 |
| ; GFX11FAKE16-NEXT: v_cmp_u_f32_e64 s3, s0, s0 |
| ; GFX11FAKE16-NEXT: s_add_i32 s2, s2, s0 |
| ; GFX11FAKE16-NEXT: s_bitset1_b32 s0, 22 |
| ; GFX11FAKE16-NEXT: s_addk_i32 s2, 0x7fff |
| ; GFX11FAKE16-NEXT: v_mov_b32_e32 v0, s1 |
| ; GFX11FAKE16-NEXT: s_and_b32 s3, s3, exec_lo |
| ; GFX11FAKE16-NEXT: s_cselect_b32 s0, s0, s2 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11FAKE16-NEXT: s_lshr_b32 s0, s0, 16 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX11FAKE16-NEXT: ; return to shader part epilog |
| ; Narrow the magnitude so the copysign itself is performed on bfloat operands. |
| %mag.trunc = fptrunc float %mag to bfloat |
| %out = call bfloat @llvm.copysign.bf16(bfloat %mag.trunc, bfloat %sign) |
| %cast = bitcast bfloat %out to i16 |
| ret i16 %cast |
| } |
| |
| ; Vector test (default calling convention): <2 x float> copysign whose magnitude |
| ; is a <2 x bfloat> widened with fpext and whose sign operand is already |
| ; <2 x float>. The <2 x float> result is returned directly. |
| ; In the expectations below, the tonga and newer runs read both magnitude |
| ; elements out of v0 (a 16-bit left shift for one, an and with 0x7fff0000 for the |
| ; other) and take the signs from v1/v2. The default-target and hawaii runs read |
| ; the magnitudes from v0 and v1 and the signs from v2/v3. |
| define <2 x float> @v_copysign_out_v2f32_mag_v2bf16_sign_v2f32(<2 x bfloat> %mag, <2 x float> %sign) { |
| ; GCN-LABEL: v_copysign_out_v2f32_mag_v2bf16_sign_v2f32: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GCN-NEXT: s_brev_b32 s4, -2 |
| ; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 |
| ; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 |
| ; GCN-NEXT: v_bfi_b32 v0, s4, v0, v2 |
| ; GCN-NEXT: v_bfi_b32 v1, s4, v1, v3 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_out_v2f32_mag_v2bf16_sign_v2f32: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 |
| ; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 |
| ; GFX7-NEXT: s_brev_b32 s4, -2 |
| ; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v2 |
| ; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v3 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_out_v2f32_mag_v2bf16_sign_v2f32: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: v_and_b32_e32 v3, 0x7fff0000, v0 |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX8-NEXT: s_brev_b32 s4, -2 |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1 |
| ; GFX8-NEXT: v_bfi_b32 v1, s4, v3, v2 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_out_v2f32_mag_v2bf16_sign_v2f32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff0000, v0 |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX9-NEXT: s_brev_b32 s4, -2 |
| ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 |
| ; GFX9-NEXT: v_bfi_b32 v1, s4, v3, v2 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_out_v2f32_mag_v2bf16_sign_v2f32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 |
| ; GFX10-NEXT: v_and_b32_e32 v4, 0x7fff0000, v0 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, v3, v1 |
| ; GFX10-NEXT: v_bfi_b32 v1, 0x7fffffff, v4, v2 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-LABEL: v_copysign_out_v2f32_mag_v2bf16_sign_v2f32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 |
| ; GFX11-NEXT: v_and_b32_e32 v4, 0x7fff0000, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v3, v1 |
| ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v4, v2 |
| ; GFX11-NEXT: s_setpc_b64 s[30:31] |
| ; Widen both magnitude elements so the copysign is performed on <2 x float>. |
| %mag.ext = fpext <2 x bfloat> %mag to <2 x float> |
| %out = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag.ext, <2 x float> %sign) |
| ret <2 x float> %out |
| } |
| |
| ; Vector test (default calling convention): <2 x float> copysign whose magnitude |
| ; is already <2 x float> and whose sign operand is a <2 x bfloat> widened with |
| ; fpext. The <2 x float> result is returned directly. |
| ; In the expectations below, the tonga and newer runs derive both sign words from |
| ; v2 (each ends up shifted left by 16 before its v_bfi_b32). The default-target |
| ; and hawaii runs take the signs from v2 and v3 after a v_mul_f32 by 1.0. |
| ; The two gfx1100 run lines (+real-true16 and -real-true16) have separate |
| ; expectations for this function. |
| define <2 x float> @v_copysign_out_v2f32_mag_v2f32_sign_v2bf16(<2 x float> %mag, <2 x bfloat> %sign) { |
| ; GCN-LABEL: v_copysign_out_v2f32_mag_v2f32_sign_v2bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 |
| ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GCN-NEXT: s_brev_b32 s4, -2 |
| ; GCN-NEXT: v_bfi_b32 v0, s4, v0, v2 |
| ; GCN-NEXT: v_bfi_b32 v1, s4, v1, v3 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_out_v2f32_mag_v2f32_sign_v2bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 |
| ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GFX7-NEXT: s_brev_b32 s4, -2 |
| ; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v2 |
| ; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v3 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_out_v2f32_mag_v2f32_sign_v2bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GFX8-NEXT: s_brev_b32 s4, -2 |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v2 |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 |
| ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_out_v2f32_mag_v2f32_sign_v2bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GFX9-NEXT: s_brev_b32 s4, -2 |
| ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v2 |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 |
| ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_out_v2f32_mag_v2f32_sign_v2bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v2 |
| ; GFX10-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v3 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11TRUE16-LABEL: v_copysign_out_v2f32_mag_v2f32_sign_v2bf16: |
| ; GFX11TRUE16: ; %bb.0: |
| ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 |
| ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v3 |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2 |
| ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11FAKE16-LABEL: v_copysign_out_v2f32_mag_v2f32_sign_v2bf16: |
| ; GFX11FAKE16: ; %bb.0: |
| ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 |
| ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v2 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v3 |
| ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] |
| ; Widen both sign elements so the copysign is performed on <2 x float>. |
| %sign.ext = fpext <2 x bfloat> %sign to <2 x float> |
| %out = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign.ext) |
| ret <2 x float> %out |
| } |
| |
| ; Vector copysign with a <2 x double> result. The magnitude %mag is already |
| ; <2 x double>; the <2 x bfloat> %sign operand is widened with fpext before the |
| ; llvm.copysign.v2f64 call. |
| ; The per-target assertions below are autogenerated (see the first line of this |
| ; file); regenerate them with utils/update_llc_test_checks.py instead of |
| ; editing them by hand. |
| define <2 x double> @v_copysign_out_v2f64_mag_v2f64_sign_v2bf16(<2 x double> %mag, <2 x bfloat> %sign) { |
| ; GCN-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_brev_b32 s4, -2 |
| ; GCN-NEXT: v_bfi_b32 v1, s4, v1, v4 |
| ; GCN-NEXT: v_bfi_b32 v3, s4, v3, v5 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: s_brev_b32 s4, -2 |
| ; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v4 |
| ; GFX7-NEXT: v_bfi_b32 v3, s4, v3, v5 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4 |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 |
| ; GFX8-NEXT: s_brev_b32 s4, -2 |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 |
| ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v5 |
| ; GFX8-NEXT: v_bfi_b32 v3, s4, v3, v4 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 |
| ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 |
| ; GFX9-NEXT: s_brev_b32 s4, -2 |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 |
| ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v5 |
| ; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v4 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v4 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 |
| ; GFX10-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v4 |
| ; GFX10-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v5 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11TRUE16-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2bf16: |
| ; GFX11TRUE16: ; %bb.0: |
| ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 |
| ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v5 |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v4 |
| ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11FAKE16-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2bf16: |
| ; GFX11FAKE16: ; %bb.0: |
| ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v4 |
| ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v4 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v5 |
| ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] |
| ; Widen the bf16 sign vector to f64, then combine it with the f64 magnitude. |
| %sign.ext = fpext <2 x bfloat> %sign to <2 x double> |
| %out = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign.ext) |
| ret <2 x double> %out |
| } |
| |
| ; Vector copysign with a <2 x bfloat> result. The <2 x float> magnitude %mag is |
| ; narrowed with fptrunc before the llvm.copysign.v2bf16 call; %sign is already |
| ; <2 x bfloat>. |
| ; The per-target assertions below are autogenerated (see the first line of this |
| ; file); regenerate them with utils/update_llc_test_checks.py instead of |
| ; editing them by hand. |
| define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16(<2 x float> %mag, <2 x bfloat> %sign) { |
| ; GCN-LABEL: v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 |
| ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3 |
| ; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2 |
| ; GCN-NEXT: v_or_b32_e32 v1, v1, v3 |
| ; GCN-NEXT: v_or_b32_e32 v0, v0, v2 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v3 |
| ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v2 |
| ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 |
| ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 |
| ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 |
| ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0 |
| ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 |
| ; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc |
| ; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 |
| ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 |
| ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 |
| ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1 |
| ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 |
| ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 |
| ; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v2 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 |
| ; GFX9-NEXT: s_movk_i32 s4, 0x7fff |
| ; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4 |
| ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 |
| ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc |
| ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 |
| ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 |
| ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 |
| ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc |
| ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 |
| ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 |
| ; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v2 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 |
| ; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1 |
| ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 |
| ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 |
| ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1 |
| ; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff |
| ; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff |
| ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo |
| ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 |
| ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo |
| ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11TRUE16-LABEL: v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16: |
| ; GFX11TRUE16: ; %bb.0: |
| ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 |
| ; GFX11TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 |
| ; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 |
| ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 |
| ; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 |
| ; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff |
| ; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo |
| ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2 |
| ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11FAKE16-LABEL: v_copysign_out_v2bf16_mag_v2f32_sign_v2bf16: |
| ; GFX11FAKE16: ; %bb.0: |
| ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 |
| ; GFX11FAKE16-NEXT: v_bfe_u32 v4, v1, 16, 1 |
| ; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 |
| ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 |
| ; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 |
| ; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff |
| ; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo |
| ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2 |
| ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] |
| ; Narrow the f32 magnitude to bf16, then apply the sign of the bf16 %sign. |
| %mag.trunc = fptrunc <2 x float> %mag to <2 x bfloat> |
| %out = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> %mag.trunc, <2 x bfloat> %sign) |
| ret <2 x bfloat> %out |
| } |
| |
| ; Vector copysign with a <2 x bfloat> result. The <2 x double> magnitude %mag is |
| ; narrowed with fptrunc before the llvm.copysign.v2bf16 call; %sign is already |
| ; <2 x bfloat>. |
| ; The per-target assertions below are autogenerated (see the first line of this |
| ; file); regenerate them with utils/update_llc_test_checks.py instead of |
| ; editing them by hand. |
| define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2f64_sign_v2bf16(<2 x double> %mag, <2 x bfloat> %sign) { |
| ; GCN-LABEL: v_copysign_out_v2bf16_mag_v2f64_sign_v2bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 |
| ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 |
| ; GCN-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] |
| ; GCN-NEXT: v_cvt_f32_f64_e32 v1, v[2:3] |
| ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v4 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v5 |
| ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3 |
| ; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2 |
| ; GCN-NEXT: v_or_b32_e32 v1, v1, v3 |
| ; GCN-NEXT: v_or_b32_e32 v0, v0, v2 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_out_v2bf16_mag_v2f64_sign_v2bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] |
| ; GFX7-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] |
| ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 |
| ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 |
| ; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v3 |
| ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 |
| ; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v4 |
| ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_out_v2bf16_mag_v2f64_sign_v2bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: v_cvt_f32_f64_e32 v7, v[0:1] |
| ; GFX8-NEXT: v_cvt_f32_f64_e32 v8, v[2:3] |
| ; GFX8-NEXT: v_cvt_f64_f32_e32 v[5:6], v7 |
| ; GFX8-NEXT: v_and_b32_e32 v9, 1, v7 |
| ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v9 |
| ; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[5:6]| |
| ; GFX8-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[5:6] |
| ; GFX8-NEXT: v_cndmask_b32_e64 v5, -1, 1, s[6:7] |
| ; GFX8-NEXT: v_add_u32_e64 v5, s[6:7], v7, v5 |
| ; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5] |
| ; GFX8-NEXT: v_cndmask_b32_e32 v7, v5, v7, vcc |
| ; GFX8-NEXT: v_bfe_u32 v5, v7, 16, 1 |
| ; GFX8-NEXT: v_add_u32_e32 v9, vcc, v5, v7 |
| ; GFX8-NEXT: v_cvt_f64_f32_e32 v[5:6], v8 |
| ; GFX8-NEXT: s_movk_i32 s4, 0x7fff |
| ; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 |
| ; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], v[0:1], v[0:1] |
| ; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[2:3]|, |v[5:6]| |
| ; GFX8-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[5:6] |
| ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v7 |
| ; GFX8-NEXT: v_and_b32_e32 v1, 1, v8 |
| ; GFX8-NEXT: v_cndmask_b32_e64 v0, v9, v7, s[4:5] |
| ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1 |
| ; GFX8-NEXT: v_cndmask_b32_e64 v1, -1, 1, s[6:7] |
| ; GFX8-NEXT: v_add_u32_e64 v1, s[6:7], v8, v1 |
| ; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5] |
| ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc |
| ; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1 |
| ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1 |
| ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 |
| ; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3] |
| ; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v1 |
| ; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v4 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_out_v2bf16_mag_v2f64_sign_v2bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_cvt_f32_f64_e32 v9, v[0:1] |
| ; GFX9-NEXT: v_cvt_f32_f64_e32 v10, v[2:3] |
| ; GFX9-NEXT: s_movk_i32 s8, 0x7fff |
| ; GFX9-NEXT: v_cvt_f64_f32_e32 v[5:6], v9 |
| ; GFX9-NEXT: v_cvt_f64_f32_e32 v[7:8], v10 |
| ; GFX9-NEXT: v_and_b32_e32 v11, 1, v9 |
| ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v11 |
| ; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[5:6]| |
| ; GFX9-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[5:6] |
| ; GFX9-NEXT: v_cndmask_b32_e64 v5, -1, 1, s[6:7] |
| ; GFX9-NEXT: v_add_u32_e32 v5, v9, v5 |
| ; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] |
| ; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[0:1], v[0:1] |
| ; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[2:3]|, |v[7:8]| |
| ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc |
| ; GFX9-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[7:8] |
| ; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 |
| ; GFX9-NEXT: v_add3_u32 v6, v6, v5, s8 |
| ; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5 |
| ; GFX9-NEXT: v_and_b32_e32 v1, 1, v10 |
| ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, v5, s[4:5] |
| ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e64 v1, -1, 1, s[6:7] |
| ; GFX9-NEXT: v_add_u32_e32 v1, v10, v1 |
| ; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] |
| ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc |
| ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3] |
| ; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 |
| ; GFX9-NEXT: v_add3_u32 v5, v5, v1, s8 |
| ; GFX9-NEXT: v_or_b32_e32 v1, 0x400000, v1 |
| ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc |
| ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 |
| ; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v4 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_out_v2bf16_mag_v2f64_sign_v2bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_cvt_f32_f64_e32 v9, v[0:1] |
| ; GFX10-NEXT: v_cvt_f32_f64_e32 v10, v[2:3] |
| ; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v9 |
| ; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v10 |
| ; GFX10-NEXT: v_and_b32_e32 v11, 1, v9 |
| ; GFX10-NEXT: v_and_b32_e32 v12, 1, v10 |
| ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v12 |
| ; GFX10-NEXT: v_cmp_gt_f64_e64 s5, |v[0:1]|, |v[5:6]| |
| ; GFX10-NEXT: v_cmp_nlg_f64_e32 vcc_lo, v[0:1], v[5:6] |
| ; GFX10-NEXT: v_cmp_nlg_f64_e64 s4, v[2:3], v[7:8] |
| ; GFX10-NEXT: v_cndmask_b32_e64 v5, -1, 1, s5 |
| ; GFX10-NEXT: v_cmp_gt_f64_e64 s5, |v[2:3]|, |v[7:8]| |
| ; GFX10-NEXT: v_add_nc_u32_e32 v5, v9, v5 |
| ; GFX10-NEXT: v_cndmask_b32_e64 v6, -1, 1, s5 |
| ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v11 |
| ; GFX10-NEXT: v_add_nc_u32_e32 v6, v10, v6 |
| ; GFX10-NEXT: s_or_b32 vcc_lo, vcc_lo, s5 |
| ; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo |
| ; GFX10-NEXT: s_or_b32 vcc_lo, s4, s6 |
| ; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo |
| ; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1] |
| ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 |
| ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5 |
| ; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1 |
| ; GFX10-NEXT: v_add3_u32 v5, v7, v5, 0x7fff |
| ; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v6 |
| ; GFX10-NEXT: v_add3_u32 v6, v8, v6, 0x7fff |
| ; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v9, vcc_lo |
| ; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[2:3], v[2:3] |
| ; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo |
| ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v4 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11TRUE16-LABEL: v_copysign_out_v2bf16_mag_v2f64_sign_v2bf16: |
| ; GFX11TRUE16: ; %bb.0: |
| ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11TRUE16-NEXT: v_cvt_f32_f64_e32 v9, v[0:1] |
| ; GFX11TRUE16-NEXT: v_cvt_f32_f64_e32 v10, v[2:3] |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11TRUE16-NEXT: v_cvt_f64_f32_e32 v[5:6], v9 |
| ; GFX11TRUE16-NEXT: v_cvt_f64_f32_e32 v[7:8], v10 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) |
| ; GFX11TRUE16-NEXT: v_cmp_gt_f64_e64 s1, |v[0:1]|, |v[5:6]| |
| ; GFX11TRUE16-NEXT: v_cmp_nlg_f64_e32 vcc_lo, v[0:1], v[5:6] |
| ; GFX11TRUE16-NEXT: v_cmp_nlg_f64_e64 s0, v[2:3], v[7:8] |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v5, -1, 1, s1 |
| ; GFX11TRUE16-NEXT: v_cmp_gt_f64_e64 s1, |v[2:3]|, |v[7:8]| |
| ; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v5, v9, v5 |
| ; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v10 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) |
| ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v6 |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v7, -1, 1, s1 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) |
| ; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v6, v10, v7 |
| ; GFX11TRUE16-NEXT: v_and_b32_e32 v11, 1, v9 |
| ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v11 |
| ; GFX11TRUE16-NEXT: s_or_b32 vcc_lo, vcc_lo, s1 |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo |
| ; GFX11TRUE16-NEXT: s_or_b32 vcc_lo, s0, s2 |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo |
| ; GFX11TRUE16-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1] |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) |
| ; GFX11TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 |
| ; GFX11TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) |
| ; GFX11TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff |
| ; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v5 |
| ; GFX11TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff |
| ; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v6 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v5, vcc_lo |
| ; GFX11TRUE16-NEXT: v_cmp_u_f64_e32 vcc_lo, v[2:3], v[2:3] |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v8, v6, vcc_lo |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v4 |
| ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11FAKE16-LABEL: v_copysign_out_v2bf16_mag_v2f64_sign_v2bf16: |
| ; GFX11FAKE16: ; %bb.0: |
| ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11FAKE16-NEXT: v_cvt_f32_f64_e32 v9, v[0:1] |
| ; GFX11FAKE16-NEXT: v_cvt_f32_f64_e32 v10, v[2:3] |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11FAKE16-NEXT: v_cvt_f64_f32_e32 v[5:6], v9 |
| ; GFX11FAKE16-NEXT: v_cvt_f64_f32_e32 v[7:8], v10 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) |
| ; GFX11FAKE16-NEXT: v_cmp_gt_f64_e64 s1, |v[0:1]|, |v[5:6]| |
| ; GFX11FAKE16-NEXT: v_cmp_nlg_f64_e32 vcc_lo, v[0:1], v[5:6] |
| ; GFX11FAKE16-NEXT: v_cmp_nlg_f64_e64 s0, v[2:3], v[7:8] |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v5, -1, 1, s1 |
| ; GFX11FAKE16-NEXT: v_cmp_gt_f64_e64 s1, |v[2:3]|, |v[7:8]| |
| ; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v5, v9, v5 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v6, -1, 1, s1 |
| ; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v6, v10, v6 |
| ; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 1, v9 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) |
| ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v11 |
| ; GFX11FAKE16-NEXT: s_or_b32 vcc_lo, vcc_lo, s1 |
| ; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v5, v5, v9 :: v_dual_and_b32 v12, 1, v10 |
| ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v12 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) |
| ; GFX11FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 |
| ; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 |
| ; GFX11FAKE16-NEXT: s_or_b32 vcc_lo, s0, s2 |
| ; GFX11FAKE16-NEXT: v_add3_u32 v5, v7, v5, 0x7fff |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo |
| ; GFX11FAKE16-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1] |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) |
| ; GFX11FAKE16-NEXT: v_bfe_u32 v8, v6, 16, 1 |
| ; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v6 |
| ; GFX11FAKE16-NEXT: v_add3_u32 v6, v8, v6, 0x7fff |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v9, vcc_lo |
| ; GFX11FAKE16-NEXT: v_cmp_u_f64_e32 vcc_lo, v[2:3], v[2:3] |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo |
| ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v4 |
| ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] |
| ; Narrow the f64 magnitude to bf16, then apply the sign of the bf16 %sign. |
| %mag.trunc = fptrunc <2 x double> %mag to <2 x bfloat> |
| %out = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> %mag.trunc, <2 x bfloat> %sign) |
| ret <2 x bfloat> %out |
| } |
| |
| ; Vector copysign with a <2 x bfloat> result. %mag is already <2 x bfloat>; the |
| ; <2 x float> %sign operand is narrowed with fptrunc before the |
| ; llvm.copysign.v2bf16 call. |
| ; The per-target assertions below are autogenerated (see the first line of this |
| ; file); regenerate them with utils/update_llc_test_checks.py instead of |
| ; editing them by hand. |
| ; REVIEW - in the GFX8 checks the second element's select falls back to the raw |
| ; v2 on the unordered path with no 0x400000 OR, unlike the first element and the |
| ; other targets. Presumably harmless because only the sign bit of the truncated |
| ; value is consumed; confirm the next time the checks are regenerated. |
| define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2bf16_sign_v2f32(<2 x bfloat> %mag, <2 x float> %sign) { |
| ; GCN-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f32: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3 |
| ; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2 |
| ; GCN-NEXT: v_or_b32_e32 v1, v1, v3 |
| ; GCN-NEXT: v_or_b32_e32 v0, v0, v2 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f32: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 |
| ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v3 |
| ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v2 |
| ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f32: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 |
| ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 |
| ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 |
| ; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1 |
| ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 |
| ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc |
| ; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 |
| ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 |
| ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 |
| ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 |
| ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 |
| ; GFX8-NEXT: v_alignbit_b32 v1, v2, v1, 16 |
| ; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 |
| ; GFX9-NEXT: s_movk_i32 s4, 0x7fff |
| ; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 |
| ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 |
| ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc |
| ; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 |
| ; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 |
| ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 |
| ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc |
| ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 |
| ; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 |
| ; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1 |
| ; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1 |
| ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1 |
| ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 |
| ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2 |
| ; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff |
| ; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff |
| ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo |
| ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 |
| ; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo |
| ; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x7060302 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v1 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11TRUE16-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f32: |
| ; GFX11TRUE16: ; %bb.0: |
| ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 |
| ; GFX11TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1 |
| ; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 |
| ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 |
| ; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 |
| ; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff |
| ; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo |
| ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v2 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v1 |
| ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11FAKE16-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f32: |
| ; GFX11FAKE16: ; %bb.0: |
| ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 |
| ; GFX11FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1 |
| ; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 |
| ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 |
| ; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 |
| ; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff |
| ; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo |
| ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11FAKE16-NEXT: v_perm_b32 v1, v2, v1, 0x7060302 |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v1 |
| ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] |
| ; Narrow the f32 sign vector to bf16, then apply its sign to the bf16 %mag. |
| %sign.trunc = fptrunc <2 x float> %sign to <2 x bfloat> |
| %out = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> %mag, <2 x bfloat> %sign.trunc) |
| ret <2 x bfloat> %out |
| } |
| |
| ; copysign on <2 x bfloat> where the magnitude is already <2 x bfloat> and the |
| ; sign operand is a <2 x double> that is narrowed to <2 x bfloat> with fptrunc |
| ; before the intrinsic call. |
| define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64(<2 x bfloat> %mag, <2 x double> %sign) { |
| ; GCN-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GCN-NEXT: v_and_b32_e32 v2, 0x80000000, v5 |
| ; GCN-NEXT: v_and_b32_e32 v3, 0x80000000, v3 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 |
| ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GCN-NEXT: v_or_b32_e32 v1, v1, v2 |
| ; GCN-NEXT: v_or_b32_e32 v0, v0, v3 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GFX7-NEXT: v_and_b32_e32 v2, 0x80000000, v5 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 |
| ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 |
| ; GFX7-NEXT: v_and_b32_e32 v2, 0x80000000, v3 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 |
| ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v4 |
| ; GFX8-NEXT: v_and_b32_e32 v2, 0x8000, v2 |
| ; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 |
| ; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 |
| ; GFX9-NEXT: v_perm_b32 v1, v4, v2, s4 |
| ; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_perm_b32 v1, v4, v2, 0x5040100 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v1 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11TRUE16-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64: |
| ; GFX11TRUE16: ; %bb.0: |
| ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v4.l |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2 |
| ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11FAKE16-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64: |
| ; GFX11FAKE16: ; %bb.0: |
| ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11FAKE16-NEXT: v_perm_b32 v1, v4, v2, 0x5040100 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v1 |
| ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] |
| %sign.trunc = fptrunc <2 x double> %sign to <2 x bfloat> |
| %out = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> %mag, <2 x bfloat> %sign.trunc) |
| ret <2 x bfloat> %out |
| } |
| |
| ; amdgpu_ps variant with inreg arguments: the <2 x bfloat> magnitude is widened |
| ; to <2 x float> with fpext and combined with a <2 x float> sign operand. The |
| ; <2 x float> result is bitcast to <2 x i32> for the return value. |
| define amdgpu_ps <2 x i32> @s_copysign_out_v2f32_mag_v2bf16_sign_v2f32(<2 x bfloat> inreg %mag, <2 x float> inreg %sign) { |
| ; GCN-LABEL: s_copysign_out_v2f32_mag_v2bf16_sign_v2f32: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s1 |
| ; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s0 |
| ; GCN-NEXT: s_brev_b32 s0, -2 |
| ; GCN-NEXT: v_mov_b32_e32 v2, s3 |
| ; GCN-NEXT: v_mov_b32_e32 v3, s2 |
| ; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 |
| ; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 |
| ; GCN-NEXT: v_bfi_b32 v0, s0, v0, v2 |
| ; GCN-NEXT: v_bfi_b32 v1, s0, v1, v3 |
| ; GCN-NEXT: v_readfirstlane_b32 s0, v1 |
| ; GCN-NEXT: v_readfirstlane_b32 s1, v0 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX7-LABEL: s_copysign_out_v2f32_mag_v2bf16_sign_v2f32: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s1 |
| ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0 |
| ; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 |
| ; GFX7-NEXT: s_brev_b32 s0, -2 |
| ; GFX7-NEXT: v_mov_b32_e32 v2, s3 |
| ; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 |
| ; GFX7-NEXT: v_bfi_b32 v0, s0, v0, v2 |
| ; GFX7-NEXT: v_mov_b32_e32 v2, s2 |
| ; GFX7-NEXT: v_bfi_b32 v1, s0, v1, v2 |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v1 |
| ; GFX7-NEXT: v_readfirstlane_b32 s1, v0 |
| ; GFX7-NEXT: ; return to shader part epilog |
| ; |
| ; GFX8-LABEL: s_copysign_out_v2f32_mag_v2bf16_sign_v2f32: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_lshl_b32 s3, s0, 16 |
| ; GFX8-NEXT: s_and_b32 s0, s0, 0x7fff0000 |
| ; GFX8-NEXT: s_brev_b32 s4, -2 |
| ; GFX8-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s2 |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1 |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX8-NEXT: v_mov_b32_e32 v2, s1 |
| ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2 |
| ; GFX8-NEXT: v_readfirstlane_b32 s0, v1 |
| ; GFX8-NEXT: v_readfirstlane_b32 s1, v0 |
| ; GFX8-NEXT: ; return to shader part epilog |
| ; |
| ; GFX9-LABEL: s_copysign_out_v2f32_mag_v2bf16_sign_v2f32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_lshl_b32 s3, s0, 16 |
| ; GFX9-NEXT: s_and_b32 s0, s0, 0x7fff0000 |
| ; GFX9-NEXT: s_brev_b32 s4, -2 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s2 |
| ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s3 |
| ; GFX9-NEXT: v_mov_b32_e32 v2, s1 |
| ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2 |
| ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 |
| ; GFX9-NEXT: v_readfirstlane_b32 s1, v0 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: s_copysign_out_v2f32_mag_v2bf16_sign_v2f32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s1 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, s2 |
| ; GFX10-NEXT: s_lshl_b32 s1, s0, 16 |
| ; GFX10-NEXT: s_and_b32 s0, s0, 0x7fff0000 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0 |
| ; GFX10-NEXT: v_bfi_b32 v1, 0x7fffffff, s0, v1 |
| ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: s_copysign_out_v2f32_mag_v2bf16_sign_v2f32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s2 |
| ; GFX11-NEXT: s_lshl_b32 s1, s0, 16 |
| ; GFX11-NEXT: s_and_b32 s0, s0, 0x7fff0000 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0 |
| ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s0, v1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX11-NEXT: v_readfirstlane_b32 s1, v1 |
| ; GFX11-NEXT: ; return to shader part epilog |
| %mag.ext = fpext <2 x bfloat> %mag to <2 x float> |
| %out = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag.ext, <2 x float> %sign) |
| %cast = bitcast <2 x float> %out to <2 x i32> |
| ret <2 x i32> %cast |
| } |
| |
| ; amdgpu_ps variant with inreg arguments: the magnitude is already <2 x float> |
| ; and the <2 x bfloat> sign operand is widened to <2 x float> with fpext. The |
| ; <2 x float> result is bitcast to <2 x i32> for the return value. |
| define amdgpu_ps <2 x i32> @s_copysign_out_v2f32_mag_v2f32_sign_v2bf16(<2 x float> inreg %mag, <2 x bfloat> inreg %sign) { |
| ; GCN-LABEL: s_copysign_out_v2f32_mag_v2f32_sign_v2bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s2 |
| ; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s3 |
| ; GCN-NEXT: s_brev_b32 s2, -2 |
| ; GCN-NEXT: v_mov_b32_e32 v2, s1 |
| ; GCN-NEXT: v_mov_b32_e32 v3, s0 |
| ; GCN-NEXT: v_bfi_b32 v1, s2, v2, v1 |
| ; GCN-NEXT: v_bfi_b32 v0, s2, v3, v0 |
| ; GCN-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GCN-NEXT: v_readfirstlane_b32 s1, v1 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX7-LABEL: s_copysign_out_v2f32_mag_v2f32_sign_v2bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s2 |
| ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s3 |
| ; GFX7-NEXT: s_brev_b32 s2, -2 |
| ; GFX7-NEXT: v_mov_b32_e32 v2, s1 |
| ; GFX7-NEXT: v_bfi_b32 v1, s2, v2, v1 |
| ; GFX7-NEXT: v_mov_b32_e32 v2, s0 |
| ; GFX7-NEXT: v_bfi_b32 v0, s2, v2, v0 |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX7-NEXT: v_readfirstlane_b32 s1, v1 |
| ; GFX7-NEXT: ; return to shader part epilog |
| ; |
| ; GFX8-LABEL: s_copysign_out_v2f32_mag_v2f32_sign_v2bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_lshr_b32 s3, s2, 16 |
| ; GFX8-NEXT: v_lshlrev_b32_e64 v0, 16, s2 |
| ; GFX8-NEXT: s_brev_b32 s2, -2 |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX8-NEXT: v_bfi_b32 v0, s2, v1, v0 |
| ; GFX8-NEXT: v_lshlrev_b32_e64 v1, 16, s3 |
| ; GFX8-NEXT: v_mov_b32_e32 v2, s1 |
| ; GFX8-NEXT: v_bfi_b32 v1, s2, v2, v1 |
| ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX8-NEXT: v_readfirstlane_b32 s1, v1 |
| ; GFX8-NEXT: ; return to shader part epilog |
| ; |
| ; GFX9-LABEL: s_copysign_out_v2f32_mag_v2f32_sign_v2bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_lshr_b32 s3, s2, 16 |
| ; GFX9-NEXT: v_lshlrev_b32_e64 v0, 16, s2 |
| ; GFX9-NEXT: s_brev_b32 s2, -2 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX9-NEXT: v_bfi_b32 v0, s2, v1, v0 |
| ; GFX9-NEXT: v_lshlrev_b32_e64 v1, 16, s3 |
| ; GFX9-NEXT: v_mov_b32_e32 v2, s1 |
| ; GFX9-NEXT: v_bfi_b32 v1, s2, v2, v1 |
| ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: s_copysign_out_v2f32_mag_v2f32_sign_v2bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_lshr_b32 s3, s2, 16 |
| ; GFX10-NEXT: v_lshlrev_b32_e64 v0, 16, s2 |
| ; GFX10-NEXT: v_lshlrev_b32_e64 v1, 16, s3 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 |
| ; GFX10-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v1 |
| ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: s_copysign_out_v2f32_mag_v2f32_sign_v2bf16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_lshr_b32 s3, s2, 16 |
| ; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s2 |
| ; GFX11-NEXT: v_lshlrev_b32_e64 v1, 16, s3 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 |
| ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX11-NEXT: v_readfirstlane_b32 s1, v1 |
| ; GFX11-NEXT: ; return to shader part epilog |
| %sign.ext = fpext <2 x bfloat> %sign to <2 x float> |
| %out = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign.ext) |
| %cast = bitcast <2 x float> %out to <2 x i32> |
| ret <2 x i32> %cast |
| } |
| |
| ; amdgpu_ps variant with inreg arguments: the magnitude is <2 x double> and the |
| ; <2 x bfloat> sign operand is widened to <2 x double> with fpext. The |
| ; <2 x double> result is bitcast to <4 x i32> for the return value. |
| define amdgpu_ps <4 x i32> @s_copysign_out_v2f64_mag_v2f64_sign_v2bf16(<2 x double> inreg %mag, <2 x bfloat> inreg %sign) { |
| ; GCN-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_brev_b32 s6, -2 |
| ; GCN-NEXT: v_mov_b32_e32 v0, s3 |
| ; GCN-NEXT: v_mov_b32_e32 v1, s5 |
| ; GCN-NEXT: v_mov_b32_e32 v2, s1 |
| ; GCN-NEXT: v_mov_b32_e32 v3, s4 |
| ; GCN-NEXT: v_bfi_b32 v0, s6, v0, v1 |
| ; GCN-NEXT: v_bfi_b32 v1, s6, v2, v3 |
| ; GCN-NEXT: v_readfirstlane_b32 s1, v1 |
| ; GCN-NEXT: v_readfirstlane_b32 s3, v0 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX7-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_brev_b32 s6, -2 |
| ; GFX7-NEXT: v_mov_b32_e32 v0, s3 |
| ; GFX7-NEXT: v_mov_b32_e32 v1, s5 |
| ; GFX7-NEXT: v_bfi_b32 v0, s6, v0, v1 |
| ; GFX7-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX7-NEXT: v_mov_b32_e32 v2, s4 |
| ; GFX7-NEXT: v_bfi_b32 v1, s6, v1, v2 |
| ; GFX7-NEXT: v_readfirstlane_b32 s1, v1 |
| ; GFX7-NEXT: v_readfirstlane_b32 s3, v0 |
| ; GFX7-NEXT: ; return to shader part epilog |
| ; |
| ; GFX8-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: v_lshlrev_b32_e64 v0, 16, s4 |
| ; GFX8-NEXT: s_brev_b32 s5, -2 |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX8-NEXT: s_lshr_b32 s1, s4, 16 |
| ; GFX8-NEXT: v_bfi_b32 v0, s5, v1, v0 |
| ; GFX8-NEXT: v_lshlrev_b32_e64 v1, 16, s1 |
| ; GFX8-NEXT: v_mov_b32_e32 v2, s3 |
| ; GFX8-NEXT: v_bfi_b32 v1, s5, v2, v1 |
| ; GFX8-NEXT: v_readfirstlane_b32 s1, v0 |
| ; GFX8-NEXT: v_readfirstlane_b32 s3, v1 |
| ; GFX8-NEXT: ; return to shader part epilog |
| ; |
| ; GFX9-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_lshlrev_b32_e64 v0, 16, s4 |
| ; GFX9-NEXT: s_brev_b32 s5, -2 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX9-NEXT: s_lshr_b32 s1, s4, 16 |
| ; GFX9-NEXT: v_bfi_b32 v0, s5, v1, v0 |
| ; GFX9-NEXT: v_lshlrev_b32_e64 v1, 16, s1 |
| ; GFX9-NEXT: v_mov_b32_e32 v2, s3 |
| ; GFX9-NEXT: v_bfi_b32 v1, s5, v2, v1 |
| ; GFX9-NEXT: v_readfirstlane_b32 s1, v0 |
| ; GFX9-NEXT: v_readfirstlane_b32 s3, v1 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_lshlrev_b32_e64 v0, 16, s4 |
| ; GFX10-NEXT: s_lshr_b32 s4, s4, 16 |
| ; GFX10-NEXT: v_lshlrev_b32_e64 v1, 16, s4 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0 |
| ; GFX10-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v1 |
| ; GFX10-NEXT: v_readfirstlane_b32 s1, v0 |
| ; GFX10-NEXT: v_readfirstlane_b32 s3, v1 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s4 |
| ; GFX11-NEXT: s_lshr_b32 s4, s4, 16 |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_lshlrev_b32_e64 v1, 16, s4 |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v1 |
| ; GFX11-NEXT: v_readfirstlane_b32 s1, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX11-NEXT: v_readfirstlane_b32 s3, v1 |
| ; GFX11-NEXT: ; return to shader part epilog |
| %sign.ext = fpext <2 x bfloat> %sign to <2 x double> |
| %out = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign.ext) |
| %cast = bitcast <2 x double> %out to <4 x i32> |
| ret <4 x i32> %cast |
| } |
| |
| ; amdgpu_ps variant with inreg arguments: the <2 x float> magnitude is narrowed |
| ; to <2 x bfloat> with fptrunc and combined with a <2 x bfloat> sign operand. |
| ; The <2 x bfloat> result is bitcast to i32 for the return value. |
| define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2f32_sign_v2bf16(<2 x float> inreg %mag, <2 x bfloat> inreg %sign) { |
| ; GCN-LABEL: s_copysign_out_v2bf16_mag_v2f32_sign_v2bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s3 |
| ; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s2 |
| ; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s1 |
| ; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s0 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: v_bfe_u32 v3, v3, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1 |
| ; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0 |
| ; GCN-NEXT: v_or_b32_e32 v1, v3, v1 |
| ; GCN-NEXT: v_or_b32_e32 v0, v2, v0 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_or_b32_e32 v0, v1, v0 |
| ; GCN-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX7-LABEL: s_copysign_out_v2bf16_mag_v2f32_sign_v2bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s3 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s2 |
| ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s1 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s0 |
| ; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0 |
| ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1 |
| ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 |
| ; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX7-NEXT: ; return to shader part epilog |
| ; |
| ; GFX8-LABEL: s_copysign_out_v2bf16_mag_v2f32_sign_v2bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_bfe_u32 s4, s0, 0x10010 |
| ; GFX8-NEXT: s_add_i32 s4, s4, s0 |
| ; GFX8-NEXT: s_or_b32 s3, s0, 0x400000 |
| ; GFX8-NEXT: s_add_i32 s6, s4, 0x7fff |
| ; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], s0, s0 |
| ; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec |
| ; GFX8-NEXT: s_cselect_b32 s3, s3, s6 |
| ; GFX8-NEXT: s_bfe_u32 s0, s1, 0x10010 |
| ; GFX8-NEXT: s_add_i32 s0, s0, s1 |
| ; GFX8-NEXT: s_or_b32 s4, s1, 0x400000 |
| ; GFX8-NEXT: s_add_i32 s5, s0, 0x7fff |
| ; GFX8-NEXT: v_cmp_u_f32_e64 s[0:1], s1, s1 |
| ; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], exec |
| ; GFX8-NEXT: s_cselect_b32 s0, s4, s5 |
| ; GFX8-NEXT: s_lshr_b32 s0, s0, 16 |
| ; GFX8-NEXT: v_mov_b32_e32 v0, s3 |
| ; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 16 |
| ; GFX8-NEXT: s_mov_b32 s0, 0x7fff7fff |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s2 |
| ; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v1 |
| ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX8-NEXT: ; return to shader part epilog |
| ; |
| ; GFX9-LABEL: s_copysign_out_v2bf16_mag_v2f32_sign_v2bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_bfe_u32 s4, s1, 0x10010 |
| ; GFX9-NEXT: s_add_i32 s4, s4, s1 |
| ; GFX9-NEXT: s_or_b32 s3, s1, 0x400000 |
| ; GFX9-NEXT: s_add_i32 s6, s4, 0x7fff |
| ; GFX9-NEXT: v_cmp_u_f32_e64 s[4:5], s1, s1 |
| ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec |
| ; GFX9-NEXT: s_cselect_b32 s1, s3, s6 |
| ; GFX9-NEXT: s_lshr_b32 s3, s1, 16 |
| ; GFX9-NEXT: s_bfe_u32 s1, s0, 0x10010 |
| ; GFX9-NEXT: s_add_i32 s1, s1, s0 |
| ; GFX9-NEXT: s_or_b32 s4, s0, 0x400000 |
| ; GFX9-NEXT: s_add_i32 s5, s1, 0x7fff |
| ; GFX9-NEXT: v_cmp_u_f32_e64 s[0:1], s0, s0 |
| ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec |
| ; GFX9-NEXT: s_cselect_b32 s0, s4, s5 |
| ; GFX9-NEXT: s_lshr_b32 s0, s0, 16 |
| ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 |
| ; GFX9-NEXT: s_mov_b32 s1, 0x7fff7fff |
| ; GFX9-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s2 |
| ; GFX9-NEXT: v_bfi_b32 v0, s1, v0, v1 |
| ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: s_copysign_out_v2bf16_mag_v2f32_sign_v2bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_bfe_u32 s3, s1, 0x10010 |
| ; GFX10-NEXT: v_cmp_u_f32_e64 s4, s1, s1 |
| ; GFX10-NEXT: s_add_i32 s3, s3, s1 |
| ; GFX10-NEXT: s_bitset1_b32 s1, 22 |
| ; GFX10-NEXT: s_addk_i32 s3, 0x7fff |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX10-NEXT: s_and_b32 s4, s4, exec_lo |
| ; GFX10-NEXT: s_cselect_b32 s1, s1, s3 |
| ; GFX10-NEXT: s_bfe_u32 s3, s0, 0x10010 |
| ; GFX10-NEXT: v_cmp_u_f32_e64 s4, s0, s0 |
| ; GFX10-NEXT: s_add_i32 s3, s3, s0 |
| ; GFX10-NEXT: s_lshr_b32 s1, s1, 16 |
| ; GFX10-NEXT: s_bitset1_b32 s0, 22 |
| ; GFX10-NEXT: s_addk_i32 s3, 0x7fff |
| ; GFX10-NEXT: s_and_b32 s4, s4, exec_lo |
| ; GFX10-NEXT: s_cselect_b32 s0, s0, s3 |
| ; GFX10-NEXT: s_lshr_b32 s0, s0, 16 |
| ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0 |
| ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: s_copysign_out_v2bf16_mag_v2f32_sign_v2bf16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_bfe_u32 s3, s1, 0x10010 |
| ; GFX11-NEXT: v_cmp_u_f32_e64 s4, s1, s1 |
| ; GFX11-NEXT: s_add_i32 s3, s3, s1 |
| ; GFX11-NEXT: s_bitset1_b32 s1, 22 |
| ; GFX11-NEXT: s_addk_i32 s3, 0x7fff |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s2 |
| ; GFX11-NEXT: s_and_b32 s4, s4, exec_lo |
| ; GFX11-NEXT: s_cselect_b32 s1, s1, s3 |
| ; GFX11-NEXT: s_bfe_u32 s3, s0, 0x10010 |
| ; GFX11-NEXT: v_cmp_u_f32_e64 s4, s0, s0 |
| ; GFX11-NEXT: s_add_i32 s3, s3, s0 |
| ; GFX11-NEXT: s_lshr_b32 s1, s1, 16 |
| ; GFX11-NEXT: s_bitset1_b32 s0, 22 |
| ; GFX11-NEXT: s_addk_i32 s3, 0x7fff |
| ; GFX11-NEXT: s_and_b32 s4, s4, exec_lo |
| ; GFX11-NEXT: s_cselect_b32 s0, s0, s3 |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_lshr_b32 s0, s0, 16 |
| ; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1 |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0 |
| ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX11-NEXT: ; return to shader part epilog |
| %mag.trunc = fptrunc <2 x float> %mag to <2 x bfloat> |
| %out = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> %mag.trunc, <2 x bfloat> %sign) |
| %cast = bitcast <2 x bfloat> %out to i32 |
| ret i32 %cast |
| } |
| |
| ; amdgpu_ps variant with inreg arguments: the <2 x double> magnitude is narrowed |
| ; to <2 x bfloat> with fptrunc and combined with a <2 x bfloat> sign operand. |
| ; The <2 x bfloat> result is bitcast to i32 for the return value. |
| define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2f64_sign_v2bf16(<2 x double> inreg %mag, <2 x bfloat> inreg %sign) { |
| ; GCN-LABEL: s_copysign_out_v2bf16_mag_v2f64_sign_v2bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s5 |
| ; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s4 |
| ; GCN-NEXT: v_cvt_f32_f64_e32 v2, s[2:3] |
| ; GCN-NEXT: v_cvt_f32_f64_e32 v3, s[0:1] |
| ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: v_bfe_u32 v3, v3, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1 |
| ; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0 |
| ; GCN-NEXT: v_or_b32_e32 v1, v3, v1 |
| ; GCN-NEXT: v_or_b32_e32 v0, v2, v0 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_or_b32_e32 v0, v1, v0 |
| ; GCN-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX7-LABEL: s_copysign_out_v2bf16_mag_v2f64_sign_v2bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: v_cvt_f32_f64_e32 v2, s[0:1] |
| ; GFX7-NEXT: v_cvt_f32_f64_e32 v3, s[2:3] |
| ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s4 |
| ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s5 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1 |
| ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 |
| ; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0 |
| ; GFX7-NEXT: v_bfe_u32 v2, v3, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX7-NEXT: ; return to shader part epilog |
| ; |
| ; GFX8-LABEL: s_copysign_out_v2bf16_mag_v2f64_sign_v2bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: v_cvt_f32_f64_e32 v2, s[0:1] |
| ; GFX8-NEXT: v_cvt_f32_f64_e32 v3, s[2:3] |
| ; GFX8-NEXT: v_cmp_u_f64_e64 s[6:7], s[0:1], s[0:1] |
| ; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 |
| ; GFX8-NEXT: v_readfirstlane_b32 s5, v2 |
| ; GFX8-NEXT: s_bitcmp1_b32 s5, 0 |
| ; GFX8-NEXT: s_cselect_b64 s[10:11], -1, 0 |
| ; GFX8-NEXT: v_cmp_nlg_f64_e32 vcc, s[0:1], v[0:1] |
| ; GFX8-NEXT: v_cmp_gt_f64_e64 s[8:9], |s[0:1]|, |v[0:1]| |
| ; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 |
| ; GFX8-NEXT: v_cmp_nlg_f64_e64 s[0:1], s[2:3], v[0:1] |
| ; GFX8-NEXT: v_cmp_gt_f64_e64 s[12:13], |s[2:3]|, |v[0:1]| |
| ; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11] |
| ; GFX8-NEXT: s_and_b64 s[8:9], s[8:9], exec |
| ; GFX8-NEXT: s_cselect_b32 s8, 1, -1 |
| ; GFX8-NEXT: s_add_i32 s14, s5, s8 |
| ; GFX8-NEXT: s_and_b64 s[8:9], s[10:11], exec |
| ; GFX8-NEXT: s_cselect_b32 s5, s5, s14 |
| ; GFX8-NEXT: s_bfe_u32 s8, s5, 0x10010 |
| ; GFX8-NEXT: s_add_i32 s8, s8, s5 |
| ; GFX8-NEXT: s_addk_i32 s8, 0x7fff |
| ; GFX8-NEXT: s_bitset1_b32 s5, 22 |
| ; GFX8-NEXT: s_and_b64 s[6:7], s[6:7], exec |
| ; GFX8-NEXT: s_cselect_b32 s5, s5, s8 |
| ; GFX8-NEXT: v_readfirstlane_b32 s8, v3 |
| ; GFX8-NEXT: s_bitcmp1_b32 s8, 0 |
| ; GFX8-NEXT: s_cselect_b64 s[6:7], -1, 0 |
| ; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] |
| ; GFX8-NEXT: s_and_b64 s[6:7], s[12:13], exec |
| ; GFX8-NEXT: v_cmp_u_f64_e64 s[2:3], s[2:3], s[2:3] |
| ; GFX8-NEXT: s_cselect_b32 s6, 1, -1 |
| ; GFX8-NEXT: s_add_i32 s6, s8, s6 |
| ; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], exec |
| ; GFX8-NEXT: s_cselect_b32 s0, s8, s6 |
| ; GFX8-NEXT: s_bfe_u32 s1, s0, 0x10010 |
| ; GFX8-NEXT: s_add_i32 s1, s1, s0 |
| ; GFX8-NEXT: s_add_i32 s6, s1, 0x7fff |
| ; GFX8-NEXT: s_or_b32 s7, s0, 0x400000 |
| ; GFX8-NEXT: s_and_b64 s[0:1], s[2:3], exec |
| ; GFX8-NEXT: s_cselect_b32 s0, s7, s6 |
| ; GFX8-NEXT: s_lshr_b32 s0, s0, 16 |
| ; GFX8-NEXT: v_mov_b32_e32 v0, s5 |
| ; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 16 |
| ; GFX8-NEXT: s_mov_b32 s0, 0x7fff7fff |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v1 |
| ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX8-NEXT: ; return to shader part epilog |
| ; |
| ; GFX9-LABEL: s_copysign_out_v2bf16_mag_v2f64_sign_v2bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: v_cvt_f32_f64_e32 v2, s[2:3] |
| ; GFX9-NEXT: v_cvt_f32_f64_e32 v3, s[0:1] |
| ; GFX9-NEXT: v_cmp_u_f64_e64 s[8:9], s[2:3], s[2:3] |
| ; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 |
| ; GFX9-NEXT: v_readfirstlane_b32 s5, v2 |
| ; GFX9-NEXT: s_bitcmp1_b32 s5, 0 |
| ; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 |
| ; GFX9-NEXT: v_cmp_nlg_f64_e32 vcc, s[2:3], v[0:1] |
| ; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], |s[2:3]|, |v[0:1]| |
| ; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 |
| ; GFX9-NEXT: v_cmp_nlg_f64_e64 s[2:3], s[0:1], v[0:1] |
| ; GFX9-NEXT: s_or_b64 s[10:11], vcc, s[10:11] |
| ; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec |
| ; GFX9-NEXT: s_cselect_b32 s6, 1, -1 |
| ; GFX9-NEXT: s_add_i32 s12, s5, s6 |
| ; GFX9-NEXT: s_and_b64 s[6:7], s[10:11], exec |
| ; GFX9-NEXT: s_cselect_b32 s5, s5, s12 |
| ; GFX9-NEXT: s_bfe_u32 s6, s5, 0x10010 |
| ; GFX9-NEXT: s_or_b32 s10, s5, 0x400000 |
| ; GFX9-NEXT: s_add_i32 s5, s6, s5 |
| ; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], |s[0:1]|, |v[0:1]| |
| ; GFX9-NEXT: s_addk_i32 s5, 0x7fff |
| ; GFX9-NEXT: s_and_b64 s[8:9], s[8:9], exec |
| ; GFX9-NEXT: s_cselect_b32 s5, s10, s5 |
| ; GFX9-NEXT: s_lshr_b32 s5, s5, 16 |
| ; GFX9-NEXT: v_readfirstlane_b32 s10, v3 |
| ; GFX9-NEXT: s_bitcmp1_b32 s10, 0 |
| ; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 |
| ; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] |
| ; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec |
| ; GFX9-NEXT: v_cmp_u_f64_e64 s[0:1], s[0:1], s[0:1] |
| ; GFX9-NEXT: s_cselect_b32 s6, 1, -1 |
| ; GFX9-NEXT: s_add_i32 s6, s10, s6 |
| ; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec |
| ; GFX9-NEXT: s_cselect_b32 s2, s10, s6 |
| ; GFX9-NEXT: s_bfe_u32 s3, s2, 0x10010 |
| ; GFX9-NEXT: s_add_i32 s3, s3, s2 |
| ; GFX9-NEXT: s_addk_i32 s3, 0x7fff |
| ; GFX9-NEXT: s_bitset1_b32 s2, 22 |
| ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec |
| ; GFX9-NEXT: s_cselect_b32 s0, s2, s3 |
| ; GFX9-NEXT: s_lshr_b32 s0, s0, 16 |
| ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s5 |
| ; GFX9-NEXT: s_mov_b32 s1, 0x7fff7fff |
| ; GFX9-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s4 |
| ; GFX9-NEXT: v_bfi_b32 v0, s1, v0, v1 |
| ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: s_copysign_out_v2bf16_mag_v2f64_sign_v2bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: v_cvt_f32_f64_e32 v4, s[2:3] |
| ; GFX10-NEXT: v_cvt_f32_f64_e32 v5, s[0:1] |
| ; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 |
| ; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v5 |
| ; GFX10-NEXT: v_readfirstlane_b32 s6, v4 |
| ; GFX10-NEXT: s_bitcmp1_b32 s6, 0 |
| ; GFX10-NEXT: s_cselect_b32 s7, -1, 0 |
| ; GFX10-NEXT: v_cmp_nlg_f64_e32 vcc_lo, s[2:3], v[0:1] |
| ; GFX10-NEXT: v_cmp_gt_f64_e64 s5, |s[2:3]|, |v[0:1]| |
| ; GFX10-NEXT: v_cmp_u_f64_e64 s3, s[2:3], s[2:3] |
| ; GFX10-NEXT: v_cmp_nlg_f64_e64 s2, s[0:1], v[2:3] |
| ; GFX10-NEXT: v_cmp_gt_f64_e64 s8, |s[0:1]|, |v[2:3]| |
| ; GFX10-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[0:1] |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s4 |
| ; GFX10-NEXT: s_or_b32 s7, vcc_lo, s7 |
| ; GFX10-NEXT: s_and_b32 s5, s5, exec_lo |
| ; GFX10-NEXT: s_cselect_b32 s5, 1, -1 |
| ; GFX10-NEXT: s_add_i32 s5, s6, s5 |
| ; GFX10-NEXT: s_and_b32 s7, s7, exec_lo |
| ; GFX10-NEXT: s_cselect_b32 s5, s6, s5 |
| ; GFX10-NEXT: v_readfirstlane_b32 s6, v5 |
| ; GFX10-NEXT: s_bfe_u32 s1, s5, 0x10010 |
| ; GFX10-NEXT: s_add_i32 s1, s1, s5 |
| ; GFX10-NEXT: s_bitset1_b32 s5, 22 |
| ; GFX10-NEXT: s_addk_i32 s1, 0x7fff |
| ; GFX10-NEXT: s_and_b32 s3, s3, exec_lo |
| ; GFX10-NEXT: s_cselect_b32 s1, s5, s1 |
| ; GFX10-NEXT: s_lshr_b32 s1, s1, 16 |
| ; GFX10-NEXT: s_bitcmp1_b32 s6, 0 |
| ; GFX10-NEXT: s_cselect_b32 s3, -1, 0 |
| ; GFX10-NEXT: s_or_b32 s2, s2, s3 |
| ; GFX10-NEXT: s_and_b32 s3, s8, exec_lo |
| ; GFX10-NEXT: s_cselect_b32 s3, 1, -1 |
| ; GFX10-NEXT: s_add_i32 s3, s6, s3 |
| ; GFX10-NEXT: s_and_b32 s2, s2, exec_lo |
| ; GFX10-NEXT: s_cselect_b32 s2, s6, s3 |
| ; GFX10-NEXT: s_bfe_u32 s3, s2, 0x10010 |
| ; GFX10-NEXT: s_add_i32 s3, s3, s2 |
| ; GFX10-NEXT: s_bitset1_b32 s2, 22 |
| ; GFX10-NEXT: s_addk_i32 s3, 0x7fff |
| ; GFX10-NEXT: s_and_b32 s0, s0, exec_lo |
| ; GFX10-NEXT: s_cselect_b32 s0, s2, s3 |
| ; GFX10-NEXT: s_lshr_b32 s0, s0, 16 |
| ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0 |
| ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: s_copysign_out_v2bf16_mag_v2f64_sign_v2bf16: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: v_cvt_f32_f64_e32 v4, s[2:3] |
| ; GFX11-NEXT: v_cvt_f32_f64_e32 v5, s[0:1] |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 |
| ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v5 |
| ; GFX11-NEXT: v_readfirstlane_b32 s7, v4 |
| ; GFX11-NEXT: s_bitcmp1_b32 s7, 0 |
| ; GFX11-NEXT: v_cmp_nlg_f64_e32 vcc_lo, s[2:3], v[0:1] |
| ; GFX11-NEXT: v_cmp_gt_f64_e64 s5, |s[2:3]|, |v[0:1]| |
| ; GFX11-NEXT: v_cmp_u_f64_e64 s3, s[2:3], s[2:3] |
| ; GFX11-NEXT: v_cmp_nlg_f64_e64 s2, s[0:1], v[2:3] |
| ; GFX11-NEXT: v_cmp_gt_f64_e64 s6, |s[0:1]|, |v[2:3]| |
| ; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[0:1] |
| ; GFX11-NEXT: s_cselect_b32 s1, -1, 0 |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s4 |
| ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1 |
| ; GFX11-NEXT: s_and_b32 s5, s5, exec_lo |
| ; GFX11-NEXT: s_cselect_b32 s5, 1, -1 |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_add_i32 s5, s7, s5 |
| ; GFX11-NEXT: s_and_b32 s1, s1, exec_lo |
| ; GFX11-NEXT: s_cselect_b32 s1, s7, s5 |
| ; GFX11-NEXT: v_readfirstlane_b32 s7, v5 |
| ; GFX11-NEXT: s_bfe_u32 s5, s1, 0x10010 |
| ; GFX11-NEXT: s_add_i32 s5, s5, s1 |
| ; GFX11-NEXT: s_bitset1_b32 s1, 22 |
| ; GFX11-NEXT: s_addk_i32 s5, 0x7fff |
| ; GFX11-NEXT: s_and_b32 s3, s3, exec_lo |
| ; GFX11-NEXT: s_cselect_b32 s1, s1, s5 |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_lshr_b32 s1, s1, 16 |
| ; GFX11-NEXT: s_bitcmp1_b32 s7, 0 |
| ; GFX11-NEXT: s_cselect_b32 s3, -1, 0 |
| ; GFX11-NEXT: s_or_b32 s2, s2, s3 |
| ; GFX11-NEXT: s_and_b32 s3, s6, exec_lo |
| ; GFX11-NEXT: s_cselect_b32 s3, 1, -1 |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_add_i32 s3, s7, s3 |
| ; GFX11-NEXT: s_and_b32 s2, s2, exec_lo |
| ; GFX11-NEXT: s_cselect_b32 s2, s7, s3 |
| ; GFX11-NEXT: s_bfe_u32 s3, s2, 0x10010 |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_add_i32 s3, s3, s2 |
| ; GFX11-NEXT: s_bitset1_b32 s2, 22 |
| ; GFX11-NEXT: s_addk_i32 s3, 0x7fff |
| ; GFX11-NEXT: s_and_b32 s0, s0, exec_lo |
| ; GFX11-NEXT: s_cselect_b32 s0, s2, s3 |
| ; GFX11-NEXT: s_lshr_b32 s0, s0, 16 |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1 |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX11-NEXT: ; return to shader part epilog |
| %mag.trunc = fptrunc <2 x double> %mag to <2 x bfloat> |
| %out = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> %mag.trunc, <2 x bfloat> %sign) |
| %cast = bitcast <2 x bfloat> %out to i32 |
| ret i32 %cast |
| } |
| |
| ; amdgpu_ps copysign on <2 x bfloat> with inreg arguments: %sign arrives as |
| ; <2 x float>, is truncated to <2 x bfloat>, and supplies the sign for %mag. |
| ; The <2 x bfloat> result is returned bitcast to i32. |
| ; NOTE(review): in the GFX8 checks the s2 path selects the unmodified input on |
| ; the unordered compare, whereas the s1 path selects the value OR'ed with |
| ; 0x400000. Presumably genuine llc output; confirm by regenerating the checks. |
| define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2bf16_sign_v2f32(<2 x bfloat> inreg %mag, <2 x float> inreg %sign) { |
| ; GCN-LABEL: s_copysign_out_v2bf16_mag_v2bf16_sign_v2f32: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s1 |
| ; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s0 |
| ; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s3 |
| ; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s2 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3 |
| ; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2 |
| ; GCN-NEXT: v_or_b32_e32 v1, v1, v3 |
| ; GCN-NEXT: v_or_b32_e32 v0, v0, v2 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_or_b32_e32 v0, v1, v0 |
| ; GCN-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX7-LABEL: s_copysign_out_v2bf16_mag_v2bf16_sign_v2f32: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s3 |
| ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s1 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 |
| ; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s2 |
| ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v2 |
| ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v3 |
| ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 |
| ; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX7-NEXT: ; return to shader part epilog |
| ; |
| ; GFX8-LABEL: s_copysign_out_v2bf16_mag_v2bf16_sign_v2f32: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_bfe_u32 s4, s1, 0x10010 |
| ; GFX8-NEXT: s_add_i32 s4, s4, s1 |
| ; GFX8-NEXT: s_or_b32 s3, s1, 0x400000 |
| ; GFX8-NEXT: s_add_i32 s6, s4, 0x7fff |
| ; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], s1, s1 |
| ; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec |
| ; GFX8-NEXT: s_cselect_b32 s1, s3, s6 |
| ; GFX8-NEXT: s_bfe_u32 s3, s2, 0x10010 |
| ; GFX8-NEXT: s_add_i32 s3, s3, s2 |
| ; GFX8-NEXT: s_addk_i32 s3, 0x7fff |
| ; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], s2, s2 |
| ; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec |
| ; GFX8-NEXT: s_cselect_b32 s2, s2, s3 |
| ; GFX8-NEXT: s_lshr_b32 s2, s2, 16 |
| ; GFX8-NEXT: v_mov_b32_e32 v0, s1 |
| ; GFX8-NEXT: v_alignbit_b32 v0, s2, v0, 16 |
| ; GFX8-NEXT: s_mov_b32 s1, 0x7fff7fff |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s0 |
| ; GFX8-NEXT: v_bfi_b32 v0, s1, v1, v0 |
| ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX8-NEXT: ; return to shader part epilog |
| ; |
| ; GFX9-LABEL: s_copysign_out_v2bf16_mag_v2bf16_sign_v2f32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_bfe_u32 s3, s2, 0x10010 |
| ; GFX9-NEXT: s_add_i32 s3, s3, s2 |
| ; GFX9-NEXT: s_or_b32 s4, s2, 0x400000 |
| ; GFX9-NEXT: s_add_i32 s5, s3, 0x7fff |
| ; GFX9-NEXT: v_cmp_u_f32_e64 s[2:3], s2, s2 |
| ; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec |
| ; GFX9-NEXT: s_cselect_b32 s2, s4, s5 |
| ; GFX9-NEXT: s_lshr_b32 s4, s2, 16 |
| ; GFX9-NEXT: s_bfe_u32 s2, s1, 0x10010 |
| ; GFX9-NEXT: s_add_i32 s2, s2, s1 |
| ; GFX9-NEXT: s_or_b32 s5, s1, 0x400000 |
| ; GFX9-NEXT: s_add_i32 s6, s2, 0x7fff |
| ; GFX9-NEXT: v_cmp_u_f32_e64 s[2:3], s1, s1 |
| ; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec |
| ; GFX9-NEXT: s_cselect_b32 s1, s5, s6 |
| ; GFX9-NEXT: s_lshr_b32 s1, s1, 16 |
| ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 |
| ; GFX9-NEXT: s_mov_b32 s2, 0x7fff7fff |
| ; GFX9-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1 |
| ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: s_copysign_out_v2bf16_mag_v2bf16_sign_v2f32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_bfe_u32 s3, s2, 0x10010 |
| ; GFX10-NEXT: v_cmp_u_f32_e64 s4, s2, s2 |
| ; GFX10-NEXT: s_add_i32 s3, s3, s2 |
| ; GFX10-NEXT: s_bitset1_b32 s2, 22 |
| ; GFX10-NEXT: s_addk_i32 s3, 0x7fff |
| ; GFX10-NEXT: s_and_b32 s4, s4, exec_lo |
| ; GFX10-NEXT: s_cselect_b32 s2, s2, s3 |
| ; GFX10-NEXT: s_bfe_u32 s3, s1, 0x10010 |
| ; GFX10-NEXT: v_cmp_u_f32_e64 s4, s1, s1 |
| ; GFX10-NEXT: s_add_i32 s3, s3, s1 |
| ; GFX10-NEXT: s_lshr_b32 s2, s2, 16 |
| ; GFX10-NEXT: s_bitset1_b32 s1, 22 |
| ; GFX10-NEXT: s_addk_i32 s3, 0x7fff |
| ; GFX10-NEXT: s_and_b32 s4, s4, exec_lo |
| ; GFX10-NEXT: s_cselect_b32 s1, s1, s3 |
| ; GFX10-NEXT: s_lshr_b32 s1, s1, 16 |
| ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s1 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0 |
| ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11-LABEL: s_copysign_out_v2bf16_mag_v2bf16_sign_v2f32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_bfe_u32 s3, s2, 0x10010 |
| ; GFX11-NEXT: v_cmp_u_f32_e64 s4, s2, s2 |
| ; GFX11-NEXT: s_add_i32 s3, s3, s2 |
| ; GFX11-NEXT: s_bitset1_b32 s2, 22 |
| ; GFX11-NEXT: s_addk_i32 s3, 0x7fff |
| ; GFX11-NEXT: s_and_b32 s4, s4, exec_lo |
| ; GFX11-NEXT: s_cselect_b32 s2, s2, s3 |
| ; GFX11-NEXT: s_bfe_u32 s3, s1, 0x10010 |
| ; GFX11-NEXT: v_cmp_u_f32_e64 s4, s1, s1 |
| ; GFX11-NEXT: s_add_i32 s3, s3, s1 |
| ; GFX11-NEXT: s_lshr_b32 s2, s2, 16 |
| ; GFX11-NEXT: s_bitset1_b32 s1, 22 |
| ; GFX11-NEXT: s_addk_i32 s3, 0x7fff |
| ; GFX11-NEXT: s_and_b32 s4, s4, exec_lo |
| ; GFX11-NEXT: s_cselect_b32 s1, s1, s3 |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_lshr_b32 s1, s1, 16 |
| ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s2 |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s1 |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX11-NEXT: ; return to shader part epilog |
| %sign.trunc = fptrunc <2 x float> %sign to <2 x bfloat> |
| %out = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> %mag, <2 x bfloat> %sign.trunc) |
| %cast = bitcast <2 x bfloat> %out to i32 |
| ret i32 %cast |
| } |
| |
| ; amdgpu_ps copysign on <2 x bfloat> with inreg arguments: %sign arrives as |
| ; <2 x double>, is truncated to <2 x bfloat>, and supplies the sign for %mag. |
| ; The <2 x bfloat> result is returned bitcast to i32. |
| ; NOTE(review): the GCN and GFX7 checks mask bit 31 of s3/s5, while the GFX8 and |
| ; later checks read the low halves of s2/s4 (and 0x8000 / s_lshl / s_pack_ll). |
| ; Confirm the register assignment of %sign on those targets when regenerating. |
| define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2bf16_sign_v2f64(<2 x bfloat> inreg %mag, <2 x double> inreg %sign) { |
| ; GCN-LABEL: s_copysign_out_v2bf16_mag_v2bf16_sign_v2f64: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s1 |
| ; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s0 |
| ; GCN-NEXT: s_and_b32 s0, s3, 0x80000000 |
| ; GCN-NEXT: s_and_b32 s1, s5, 0x80000000 |
| ; GCN-NEXT: s_lshr_b32 s0, s0, 16 |
| ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GCN-NEXT: s_lshr_b32 s1, s1, 16 |
| ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GCN-NEXT: v_or_b32_e32 v1, s0, v1 |
| ; GCN-NEXT: v_or_b32_e32 v0, s1, v0 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_or_b32_e32 v0, v1, v0 |
| ; GCN-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GCN-NEXT: ; return to shader part epilog |
| ; |
| ; GFX7-LABEL: s_copysign_out_v2bf16_mag_v2bf16_sign_v2f64: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0 |
| ; GFX7-NEXT: s_and_b32 s0, s3, 0x80000000 |
| ; GFX7-NEXT: s_lshr_b32 s0, s0, 16 |
| ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s1 |
| ; GFX7-NEXT: v_or_b32_e32 v1, s0, v1 |
| ; GFX7-NEXT: s_and_b32 s0, s5, 0x80000000 |
| ; GFX7-NEXT: s_lshr_b32 s0, s0, 16 |
| ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 |
| ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX7-NEXT: ; return to shader part epilog |
| ; |
| ; GFX8-LABEL: s_copysign_out_v2bf16_mag_v2bf16_sign_v2f64: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_lshl_b32 s1, s4, 16 |
| ; GFX8-NEXT: s_and_b32 s2, s2, 0x8000 |
| ; GFX8-NEXT: s_or_b32 s1, s2, s1 |
| ; GFX8-NEXT: s_mov_b32 s2, 0x7fff7fff |
| ; GFX8-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX8-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX8-NEXT: v_bfi_b32 v0, s2, v0, v1 |
| ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX8-NEXT: ; return to shader part epilog |
| ; |
| ; GFX9-LABEL: s_copysign_out_v2bf16_mag_v2bf16_sign_v2f64: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s2, s4 |
| ; GFX9-NEXT: s_mov_b32 s2, 0x7fff7fff |
| ; GFX9-NEXT: v_mov_b32_e32 v0, s0 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, s1 |
| ; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1 |
| ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX9-NEXT: ; return to shader part epilog |
| ; |
| ; GFX10-LABEL: s_copysign_out_v2bf16_mag_v2bf16_sign_v2f64: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s2, s4 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, s1 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0 |
| ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX10-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11TRUE16-LABEL: s_copysign_out_v2bf16_mag_v2bf16_sign_v2f64: |
| ; GFX11TRUE16: ; %bb.0: |
| ; GFX11TRUE16-NEXT: s_mov_b32 s1, s2 |
| ; GFX11TRUE16-NEXT: s_mov_b32 s2, s4 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) |
| ; GFX11TRUE16-NEXT: s_pack_ll_b32_b16 s1, s1, s2 |
| ; GFX11TRUE16-NEXT: v_mov_b32_e32 v0, s1 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0 |
| ; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX11TRUE16-NEXT: ; return to shader part epilog |
| ; |
| ; GFX11FAKE16-LABEL: s_copysign_out_v2bf16_mag_v2bf16_sign_v2f64: |
| ; GFX11FAKE16: ; %bb.0: |
| ; GFX11FAKE16-NEXT: s_pack_ll_b32_b16 s1, s2, s4 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11FAKE16-NEXT: v_mov_b32_e32 v0, s1 |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0 |
| ; GFX11FAKE16-NEXT: ; return to shader part epilog |
| %sign.trunc = fptrunc <2 x double> %sign to <2 x bfloat> |
| %out = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> %mag, <2 x bfloat> %sign.trunc) |
| %cast = bitcast <2 x bfloat> %out to i32 |
| ret i32 %cast |
| } |
| |
| |
| ; copysign producing <3 x float>: %mag is a <3 x bfloat> argument that is |
| ; extended to <3 x float> first, while %sign is already <3 x float>. |
| ; Every target's checks end with one v_bfi_b32 per result element. |
| define <3 x float> @v_copysign_out_v3f32_mag_v3bf16_sign_v3f32(<3 x bfloat> %mag, <3 x float> %sign) { |
| ; GCN-LABEL: v_copysign_out_v3f32_mag_v3bf16_sign_v3f32: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GCN-NEXT: s_brev_b32 s4, -2 |
| ; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 |
| ; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 |
| ; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 |
| ; GCN-NEXT: v_bfi_b32 v0, s4, v0, v3 |
| ; GCN-NEXT: v_bfi_b32 v1, s4, v1, v4 |
| ; GCN-NEXT: v_bfi_b32 v2, s4, v2, v5 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_out_v3f32_mag_v3bf16_sign_v3f32: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 |
| ; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 |
| ; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 |
| ; GFX7-NEXT: s_brev_b32 s4, -2 |
| ; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v3 |
| ; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v4 |
| ; GFX7-NEXT: v_bfi_b32 v2, s4, v2, v5 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_out_v3f32_mag_v3bf16_sign_v3f32: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1 |
| ; GFX8-NEXT: v_and_b32_e32 v1, 0x7fff0000, v0 |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX8-NEXT: s_brev_b32 s4, -2 |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v2 |
| ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v3 |
| ; GFX8-NEXT: v_bfi_b32 v2, s4, v5, v4 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_out_v3f32_mag_v3bf16_sign_v3f32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 |
| ; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff0000, v0 |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX9-NEXT: s_brev_b32 s4, -2 |
| ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v2 |
| ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v3 |
| ; GFX9-NEXT: v_bfi_b32 v2, s4, v5, v4 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_out_v3f32_mag_v3bf16_sign_v3f32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0 |
| ; GFX10-NEXT: v_and_b32_e32 v6, 0x7fff0000, v0 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v1 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, v5, v2 |
| ; GFX10-NEXT: v_bfi_b32 v1, 0x7fffffff, v6, v3 |
| ; GFX10-NEXT: v_bfi_b32 v2, 0x7fffffff, v7, v4 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-LABEL: v_copysign_out_v3f32_mag_v3bf16_sign_v3f32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0 |
| ; GFX11-NEXT: v_and_b32_e32 v6, 0x7fff0000, v0 |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v5, v2 |
| ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v6, v3 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) |
| ; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v7, v4 |
| ; GFX11-NEXT: s_setpc_b64 s[30:31] |
| %mag.ext = fpext <3 x bfloat> %mag to <3 x float> |
| %out = call <3 x float> @llvm.copysign.v3f32(<3 x float> %mag.ext, <3 x float> %sign) |
| ret <3 x float> %out |
| } |
| |
| ; copysign producing <3 x float>: %mag is already <3 x float>, while %sign is a |
| ; <3 x bfloat> argument that is extended to <3 x float> before the intrinsic. |
| ; Every target's checks end with one v_bfi_b32 per result element. |
| define <3 x float> @v_copysign_out_v3f32_mag_v3f32_sign_v3bf16(<3 x float> %mag, <3 x bfloat> %sign) { |
| ; GCN-LABEL: v_copysign_out_v3f32_mag_v3f32_sign_v3bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 |
| ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 |
| ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 |
| ; GCN-NEXT: s_brev_b32 s4, -2 |
| ; GCN-NEXT: v_bfi_b32 v0, s4, v0, v3 |
| ; GCN-NEXT: v_bfi_b32 v1, s4, v1, v4 |
| ; GCN-NEXT: v_bfi_b32 v2, s4, v2, v5 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_out_v3f32_mag_v3f32_sign_v3bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 |
| ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 |
| ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 |
| ; GFX7-NEXT: s_brev_b32 s4, -2 |
| ; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v3 |
| ; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v4 |
| ; GFX7-NEXT: v_bfi_b32 v2, s4, v2, v5 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_out_v3f32_mag_v3f32_sign_v3bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 |
| ; GFX8-NEXT: s_brev_b32 s4, -2 |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v3 |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4 |
| ; GFX8-NEXT: v_bfi_b32 v2, s4, v2, v3 |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v5 |
| ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v3 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_out_v3f32_mag_v3f32_sign_v3bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v3 |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 |
| ; GFX9-NEXT: s_brev_b32 s4, -2 |
| ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v3 |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4 |
| ; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v3 |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v5 |
| ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v3 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_out_v3f32_mag_v3f32_sign_v3bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v3 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v3 |
| ; GFX10-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v4 |
| ; GFX10-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v5 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11TRUE16-LABEL: v_copysign_out_v3f32_mag_v3f32_sign_v3bf16: |
| ; GFX11TRUE16: ; %bb.0: |
| ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h |
| ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) |
| ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 |
| ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v4 |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v5 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v3 |
| ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11FAKE16-LABEL: v_copysign_out_v3f32_mag_v3f32_sign_v3bf16: |
| ; GFX11FAKE16: ; %bb.0: |
| ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3 |
| ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 |
| ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) |
| ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v3 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v4 |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v5 |
| ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] |
| %sign.ext = fpext <3 x bfloat> %sign to <3 x float> |
| %out = call <3 x float> @llvm.copysign.v3f32(<3 x float> %mag, <3 x float> %sign.ext) |
| ret <3 x float> %out |
| } |
| |
| ; copysign producing <3 x double>: %mag is already <3 x double>, while %sign is |
| ; a <3 x bfloat> argument that is extended to <3 x double> before the intrinsic. |
| ; Every target's checks apply v_bfi_b32 to v1, v3 and v5 only (presumably the |
| ; high dwords of the three doubles; confirm against the calling convention). |
| define <3 x double> @v_copysign_out_v3f64_mag_v3f64_sign_v3bf16(<3 x double> %mag, <3 x bfloat> %sign) { |
| ; GCN-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_brev_b32 s4, -2 |
| ; GCN-NEXT: v_bfi_b32 v1, s4, v1, v6 |
| ; GCN-NEXT: v_bfi_b32 v3, s4, v3, v7 |
| ; GCN-NEXT: v_bfi_b32 v5, s4, v5, v8 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: s_brev_b32 s4, -2 |
| ; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v6 |
| ; GFX7-NEXT: v_bfi_b32 v3, s4, v3, v7 |
| ; GFX7-NEXT: v_bfi_b32 v5, s4, v5, v8 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v6 |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 |
| ; GFX8-NEXT: s_brev_b32 s4, -2 |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 |
| ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v8 |
| ; GFX8-NEXT: v_bfi_b32 v5, s4, v5, v7 |
| ; GFX8-NEXT: v_bfi_b32 v3, s4, v3, v6 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v6 |
| ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 |
| ; GFX9-NEXT: s_brev_b32 s4, -2 |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 |
| ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v8 |
| ; GFX9-NEXT: v_bfi_b32 v5, s4, v5, v7 |
| ; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v6 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v6 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8 |
| ; GFX10-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v6 |
| ; GFX10-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v7 |
| ; GFX10-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v8 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11TRUE16-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3bf16: |
| ; GFX11TRUE16: ; %bb.0: |
| ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h |
| ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) |
| ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 |
| ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v7 |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v8 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v6 |
| ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11FAKE16-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3bf16: |
| ; GFX11FAKE16: ; %bb.0: |
| ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v6 |
| ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 |
| ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) |
| ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v6 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v7 |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v8 |
| ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] |
| %sign.ext = fpext <3 x bfloat> %sign to <3 x double> |
| %out = call <3 x double> @llvm.copysign.v3f64(<3 x double> %mag, <3 x double> %sign.ext) |
| ret <3 x double> %out |
| } |
| |
| ; copysign producing <3 x bfloat>: %mag arrives as <3 x float> and is truncated |
| ; to <3 x bfloat> first, while %sign is already <3 x bfloat>. |
| ; NOTE(review): the GFX10 and GFX11FAKE16 checks contain a v_alignbit_b32 that |
| ; reads s4 / s0 with no earlier write visible in the function. Presumably a |
| ; don't-care value for the unused upper half; confirm when regenerating. |
| define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3f32_sign_v3bf16(<3 x float> %mag, <3 x bfloat> %sign) { |
| ; GCN-LABEL: v_copysign_out_v3bf16_mag_v3f32_sign_v3bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 |
| ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 |
| ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 |
| ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 |
| ; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v5, 0x8000, v5 |
| ; GCN-NEXT: v_and_b32_e32 v4, 0x8000, v4 |
| ; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3 |
| ; GCN-NEXT: v_or_b32_e32 v2, v2, v5 |
| ; GCN-NEXT: v_or_b32_e32 v1, v1, v4 |
| ; GCN-NEXT: v_or_b32_e32 v0, v0, v3 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_out_v3bf16_mag_v3f32_sign_v3bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 |
| ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 |
| ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 |
| ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GFX7-NEXT: v_and_b32_e32 v5, 0x8000, v5 |
| ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v4, 0x8000, v4 |
| ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v3 |
| ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 |
| ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_out_v3bf16_mag_v3f32_sign_v3bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 |
| ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 |
| ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 |
| ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v2 |
| ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 |
| ; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v5, vcc |
| ; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1 |
| ; GFX8-NEXT: s_movk_i32 s4, 0x7fff |
| ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 |
| ; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6 |
| ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0 |
| ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 |
| ; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc |
| ; GFX8-NEXT: v_bfe_u32 v6, v1, 16, 1 |
| ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v1 |
| ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 |
| ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1 |
| ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 |
| ; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 |
| ; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 |
| ; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v3 |
| ; GFX8-NEXT: v_bfi_b32 v1, s4, v2, v4 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_out_v3bf16_mag_v3f32_sign_v3bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_bfe_u32 v5, v2, 16, 1 |
| ; GFX9-NEXT: s_movk_i32 s4, 0x7fff |
| ; GFX9-NEXT: v_add3_u32 v5, v5, v2, s4 |
| ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v2 |
| ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc |
| ; GFX9-NEXT: v_bfe_u32 v5, v0, 16, 1 |
| ; GFX9-NEXT: v_add3_u32 v5, v5, v0, s4 |
| ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v0 |
| ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc |
| ; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 |
| ; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4 |
| ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1 |
| ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc |
| ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 |
| ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 |
| ; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX9-NEXT: v_alignbit_b32 v1, s4, v2, 16 |
| ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v3 |
| ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v4 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_out_v3bf16_mag_v3f32_sign_v3bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1 |
| ; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1 |
| ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0 |
| ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 |
| ; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 |
| ; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff |
| ; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v1 |
| ; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff |
| ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2 |
| ; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff |
| ; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v9, vcc_lo |
| ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 |
| ; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v10, vcc_lo |
| ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 |
| ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 |
| ; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v3 |
| ; GFX10-NEXT: v_alignbit_b32 v1, s4, v2, 16 |
| ; GFX10-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v4 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11TRUE16-LABEL: v_copysign_out_v3bf16_mag_v3f32_sign_v3bf16: |
| ; GFX11TRUE16: ; %bb.0: |
| ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1 |
| ; GFX11TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1 |
| ; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 |
| ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 |
| ; GFX11TRUE16-NEXT: v_bfe_u32 v8, v2, 16, 1 |
| ; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff |
| ; GFX11TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 |
| ; GFX11TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc_lo |
| ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 |
| ; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 |
| ; GFX11TRUE16-NEXT: v_add3_u32 v7, v8, v2, 0x7fff |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc_lo |
| ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc_lo |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v3 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v4 |
| ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11FAKE16-LABEL: v_copysign_out_v3bf16_mag_v3f32_sign_v3bf16: |
| ; GFX11FAKE16: ; %bb.0: |
| ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11FAKE16-NEXT: v_bfe_u32 v5, v0, 16, 1 |
| ; GFX11FAKE16-NEXT: v_bfe_u32 v7, v1, 16, 1 |
| ; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v0 |
| ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 |
| ; GFX11FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 |
| ; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff |
| ; GFX11FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v1 |
| ; GFX11FAKE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff |
| ; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 |
| ; GFX11FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v9, vcc_lo |
| ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v7, v10, vcc_lo |
| ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) |
| ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v3 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v2, 16 |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v4 |
| ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] |
| %mag.trunc = fptrunc <3 x float> %mag to <3 x bfloat> |
| %out = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> %mag.trunc, <3 x bfloat> %sign) |
| ret <3 x bfloat> %out |
| } |
| |
| ; copysign on <3 x bfloat> where the magnitude operand arrives as <3 x double> |
| ; and is narrowed to <3 x bfloat> with fptrunc before the |
| ; llvm.copysign.v3bf16 call; the sign operand is already <3 x bfloat>. |
| ; The assertions below are autogenerated; regenerate them with |
| ; utils/update_llc_test_checks.py rather than editing them by hand. |
| define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3f64_sign_v3bf16(<3 x double> %mag, <3 x bfloat> %sign) { |
| ; GCN-LABEL: v_copysign_out_v3bf16_mag_v3f64_sign_v3bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 |
| ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 |
| ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 |
| ; GCN-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] |
| ; GCN-NEXT: v_cvt_f32_f64_e32 v1, v[2:3] |
| ; GCN-NEXT: v_cvt_f32_f64_e32 v2, v[4:5] |
| ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v6 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v7 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v8 |
| ; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v5, 0x8000, v5 |
| ; GCN-NEXT: v_and_b32_e32 v4, 0x8000, v4 |
| ; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3 |
| ; GCN-NEXT: v_or_b32_e32 v2, v2, v5 |
| ; GCN-NEXT: v_or_b32_e32 v1, v1, v4 |
| ; GCN-NEXT: v_or_b32_e32 v0, v0, v3 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_out_v3bf16_mag_v3f64_sign_v3bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_cvt_f32_f64_e32 v4, v[4:5] |
| ; GFX7-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] |
| ; GFX7-NEXT: v_cvt_f32_f64_e32 v1, v[2:3] |
| ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 |
| ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 |
| ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 |
| ; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v8 |
| ; GFX7-NEXT: v_bfe_u32 v3, v4, 16, 15 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 |
| ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 |
| ; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v7 |
| ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 |
| ; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v6 |
| ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_out_v3bf16_mag_v3f64_sign_v3bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: v_cvt_f32_f64_e32 v10, v[4:5] |
| ; GFX8-NEXT: s_movk_i32 s8, 0x7fff |
| ; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v10 |
| ; GFX8-NEXT: v_and_b32_e32 v11, 1, v10 |
| ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v11 |
| ; GFX8-NEXT: v_cvt_f32_f64_e32 v11, v[0:1] |
| ; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[4:5]|, |v[8:9]| |
| ; GFX8-NEXT: v_cmp_nlg_f64_e32 vcc, v[4:5], v[8:9] |
| ; GFX8-NEXT: v_cndmask_b32_e64 v8, -1, 1, s[6:7] |
| ; GFX8-NEXT: v_add_u32_e64 v8, s[6:7], v10, v8 |
| ; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5] |
| ; GFX8-NEXT: v_cndmask_b32_e32 v10, v8, v10, vcc |
| ; GFX8-NEXT: v_bfe_u32 v8, v10, 16, 1 |
| ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10 |
| ; GFX8-NEXT: v_add_u32_e32 v12, vcc, s8, v8 |
| ; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[4:5] |
| ; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v11 |
| ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 |
| ; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[8:9]| |
| ; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v10, vcc |
| ; GFX8-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[8:9] |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v4 |
| ; GFX8-NEXT: v_and_b32_e32 v4, 1, v11 |
| ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v4 |
| ; GFX8-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7] |
| ; GFX8-NEXT: v_cvt_f32_f64_e32 v9, v[2:3] |
| ; GFX8-NEXT: v_add_u32_e64 v4, s[6:7], v11, v4 |
| ; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5] |
| ; GFX8-NEXT: v_cndmask_b32_e32 v8, v4, v11, vcc |
| ; GFX8-NEXT: v_bfe_u32 v4, v8, 16, 1 |
| ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v4, v8 |
| ; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v9 |
| ; GFX8-NEXT: v_add_u32_e32 v11, vcc, s8, v11 |
| ; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], v[0:1], v[0:1] |
| ; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[2:3]|, |v[4:5]| |
| ; GFX8-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[4:5] |
| ; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v8 |
| ; GFX8-NEXT: v_and_b32_e32 v1, 1, v9 |
| ; GFX8-NEXT: v_cndmask_b32_e64 v0, v11, v8, s[4:5] |
| ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1 |
| ; GFX8-NEXT: v_cndmask_b32_e64 v1, -1, 1, s[6:7] |
| ; GFX8-NEXT: v_add_u32_e64 v1, s[6:7], v9, v1 |
| ; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5] |
| ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc |
| ; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 |
| ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 |
| ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 |
| ; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3] |
| ; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v1 |
| ; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v6 |
| ; GFX8-NEXT: v_bfi_b32 v1, s4, v10, v7 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_out_v3bf16_mag_v3f64_sign_v3bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_cvt_f32_f64_e32 v10, v[4:5] |
| ; GFX9-NEXT: v_cvt_f32_f64_e32 v11, v[0:1] |
| ; GFX9-NEXT: s_movk_i32 s8, 0x7fff |
| ; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v10 |
| ; GFX9-NEXT: v_and_b32_e32 v12, 1, v10 |
| ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v12 |
| ; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[4:5]|, |v[8:9]| |
| ; GFX9-NEXT: v_cmp_nlg_f64_e32 vcc, v[4:5], v[8:9] |
| ; GFX9-NEXT: v_cndmask_b32_e64 v8, -1, 1, s[6:7] |
| ; GFX9-NEXT: v_add_u32_e32 v8, v10, v8 |
| ; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] |
| ; GFX9-NEXT: v_cndmask_b32_e32 v10, v8, v10, vcc |
| ; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v11 |
| ; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[4:5], v[4:5] |
| ; GFX9-NEXT: v_bfe_u32 v12, v10, 16, 1 |
| ; GFX9-NEXT: v_add3_u32 v12, v12, v10, s8 |
| ; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[8:9]| |
| ; GFX9-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[8:9] |
| ; GFX9-NEXT: v_cvt_f32_f64_e32 v8, v[2:3] |
| ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 |
| ; GFX9-NEXT: v_and_b32_e32 v4, 1, v11 |
| ; GFX9-NEXT: v_cndmask_b32_e64 v10, v12, v10, s[4:5] |
| ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v4 |
| ; GFX9-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7] |
| ; GFX9-NEXT: v_add_u32_e32 v4, v11, v4 |
| ; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] |
| ; GFX9-NEXT: v_cndmask_b32_e32 v9, v4, v11, vcc |
| ; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v8 |
| ; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[0:1], v[0:1] |
| ; GFX9-NEXT: v_bfe_u32 v11, v9, 16, 1 |
| ; GFX9-NEXT: v_add3_u32 v11, v11, v9, s8 |
| ; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[2:3]|, |v[4:5]| |
| ; GFX9-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[4:5] |
| ; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9 |
| ; GFX9-NEXT: v_and_b32_e32 v1, 1, v8 |
| ; GFX9-NEXT: v_cndmask_b32_e64 v0, v11, v9, s[4:5] |
| ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e64 v1, -1, 1, s[6:7] |
| ; GFX9-NEXT: v_add_u32_e32 v1, v8, v1 |
| ; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] |
| ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc |
| ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3] |
| ; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1 |
| ; GFX9-NEXT: v_add3_u32 v4, v4, v1, s8 |
| ; GFX9-NEXT: v_or_b32_e32 v1, 0x400000, v1 |
| ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc |
| ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 |
| ; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX9-NEXT: v_alignbit_b32 v1, s4, v10, 16 |
| ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v6 |
| ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v7 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_out_v3bf16_mag_v3f64_sign_v3bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_cvt_f32_f64_e32 v14, v[4:5] |
| ; GFX10-NEXT: v_cvt_f32_f64_e32 v15, v[0:1] |
| ; GFX10-NEXT: v_cvt_f32_f64_e32 v16, v[2:3] |
| ; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v14 |
| ; GFX10-NEXT: v_cvt_f64_f32_e32 v[10:11], v15 |
| ; GFX10-NEXT: v_cvt_f64_f32_e32 v[12:13], v16 |
| ; GFX10-NEXT: v_and_b32_e32 v17, 1, v14 |
| ; GFX10-NEXT: v_and_b32_e32 v18, 1, v15 |
| ; GFX10-NEXT: v_and_b32_e32 v19, 1, v16 |
| ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v17 |
| ; GFX10-NEXT: v_cmp_eq_u32_e64 s8, 1, v19 |
| ; GFX10-NEXT: v_cmp_gt_f64_e64 s7, |v[4:5]|, |v[8:9]| |
| ; GFX10-NEXT: v_cmp_nlg_f64_e32 vcc_lo, v[4:5], v[8:9] |
| ; GFX10-NEXT: v_cmp_nlg_f64_e64 s4, v[0:1], v[10:11] |
| ; GFX10-NEXT: v_cmp_nlg_f64_e64 s5, v[2:3], v[12:13] |
| ; GFX10-NEXT: v_cndmask_b32_e64 v8, -1, 1, s7 |
| ; GFX10-NEXT: v_cmp_gt_f64_e64 s7, |v[0:1]|, |v[10:11]| |
| ; GFX10-NEXT: s_or_b32 vcc_lo, vcc_lo, s6 |
| ; GFX10-NEXT: v_add_nc_u32_e32 v8, v14, v8 |
| ; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v14, vcc_lo |
| ; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v8 |
| ; GFX10-NEXT: v_cndmask_b32_e64 v9, -1, 1, s7 |
| ; GFX10-NEXT: v_cmp_gt_f64_e64 s7, |v[2:3]|, |v[12:13]| |
| ; GFX10-NEXT: v_bfe_u32 v12, v8, 16, 1 |
| ; GFX10-NEXT: v_add_nc_u32_e32 v9, v15, v9 |
| ; GFX10-NEXT: v_add3_u32 v8, v12, v8, 0x7fff |
| ; GFX10-NEXT: v_cndmask_b32_e64 v10, -1, 1, s7 |
| ; GFX10-NEXT: v_cmp_eq_u32_e64 s7, 1, v18 |
| ; GFX10-NEXT: v_add_nc_u32_e32 v10, v16, v10 |
| ; GFX10-NEXT: s_or_b32 vcc_lo, s4, s7 |
| ; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc_lo |
| ; GFX10-NEXT: s_or_b32 vcc_lo, s5, s8 |
| ; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v16, vcc_lo |
| ; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1] |
| ; GFX10-NEXT: v_bfe_u32 v11, v9, 16, 1 |
| ; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v9 |
| ; GFX10-NEXT: v_bfe_u32 v13, v10, 16, 1 |
| ; GFX10-NEXT: v_add3_u32 v9, v11, v9, 0x7fff |
| ; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v10 |
| ; GFX10-NEXT: v_add3_u32 v10, v13, v10, 0x7fff |
| ; GFX10-NEXT: v_cndmask_b32_e32 v0, v9, v15, vcc_lo |
| ; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[2:3], v[2:3] |
| ; GFX10-NEXT: v_cndmask_b32_e32 v1, v10, v11, vcc_lo |
| ; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[4:5], v[4:5] |
| ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v6 |
| ; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v14, vcc_lo |
| ; GFX10-NEXT: v_alignbit_b32 v1, s4, v2, 16 |
| ; GFX10-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v7 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11TRUE16-LABEL: v_copysign_out_v3bf16_mag_v3f64_sign_v3bf16: |
| ; GFX11TRUE16: ; %bb.0: |
| ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11TRUE16-NEXT: v_cvt_f32_f64_e32 v16, v[4:5] |
| ; GFX11TRUE16-NEXT: v_cvt_f32_f64_e32 v14, v[0:1] |
| ; GFX11TRUE16-NEXT: v_cvt_f32_f64_e32 v15, v[2:3] |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) |
| ; GFX11TRUE16-NEXT: v_cvt_f64_f32_e32 v[12:13], v16 |
| ; GFX11TRUE16-NEXT: v_cvt_f64_f32_e32 v[8:9], v14 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) |
| ; GFX11TRUE16-NEXT: v_cvt_f64_f32_e32 v[10:11], v15 |
| ; GFX11TRUE16-NEXT: v_and_b32_e32 v18, 1, v16 |
| ; GFX11TRUE16-NEXT: v_cmp_gt_f64_e64 s4, |v[4:5]|, |v[12:13]| |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) |
| ; GFX11TRUE16-NEXT: v_cmp_gt_f64_e64 s2, |v[0:1]|, |v[8:9]| |
| ; GFX11TRUE16-NEXT: v_cmp_gt_f64_e64 s3, |v[2:3]|, |v[10:11]| |
| ; GFX11TRUE16-NEXT: v_cmp_nlg_f64_e32 vcc_lo, v[0:1], v[8:9] |
| ; GFX11TRUE16-NEXT: v_cmp_nlg_f64_e64 s1, v[4:5], v[12:13] |
| ; GFX11TRUE16-NEXT: v_cmp_nlg_f64_e64 s0, v[2:3], v[10:11] |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v10, -1, 1, s4 |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v8, -1, 1, s2 |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v9, -1, 1, s3 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) |
| ; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v10, v16, v10 |
| ; GFX11TRUE16-NEXT: v_and_b32_e32 v19, 1, v14 |
| ; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v8, v14, v8 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) |
| ; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v9, v15, v9 |
| ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v19 |
| ; GFX11TRUE16-NEXT: s_or_b32 vcc_lo, vcc_lo, s3 |
| ; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v8, v8, v14 :: v_dual_and_b32 v17, 1, v15 |
| ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v18 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) |
| ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v17 |
| ; GFX11TRUE16-NEXT: v_bfe_u32 v11, v8, 16, 1 |
| ; GFX11TRUE16-NEXT: s_or_b32 vcc_lo, s1, vcc_lo |
| ; GFX11TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v8 |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v10, v10, v16, vcc_lo |
| ; GFX11TRUE16-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1] |
| ; GFX11TRUE16-NEXT: v_add3_u32 v8, v11, v8, 0x7fff |
| ; GFX11TRUE16-NEXT: s_or_b32 s0, s0, s2 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v9, v9, v15, s0 |
| ; GFX11TRUE16-NEXT: v_bfe_u32 v11, v10, 16, 1 |
| ; GFX11TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v10 |
| ; GFX11TRUE16-NEXT: v_bfe_u32 v12, v9, 16, 1 |
| ; GFX11TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v9 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) |
| ; GFX11TRUE16-NEXT: v_add3_u32 v9, v12, v9, 0x7fff |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v8, v13, vcc_lo |
| ; GFX11TRUE16-NEXT: v_cmp_u_f64_e32 vcc_lo, v[2:3], v[2:3] |
| ; GFX11TRUE16-NEXT: v_add3_u32 v8, v11, v10, 0x7fff |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v9, v14, vcc_lo |
| ; GFX11TRUE16-NEXT: v_cmp_u_f64_e32 vcc_lo, v[4:5], v[4:5] |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2 |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v6 |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc_lo |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v7 |
| ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11FAKE16-LABEL: v_copysign_out_v3bf16_mag_v3f64_sign_v3bf16: |
| ; GFX11FAKE16: ; %bb.0: |
| ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11FAKE16-NEXT: v_cvt_f32_f64_e32 v14, v[4:5] |
| ; GFX11FAKE16-NEXT: v_cvt_f32_f64_e32 v15, v[0:1] |
| ; GFX11FAKE16-NEXT: v_cvt_f32_f64_e32 v16, v[2:3] |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) |
| ; GFX11FAKE16-NEXT: v_cvt_f64_f32_e32 v[8:9], v14 |
| ; GFX11FAKE16-NEXT: v_cvt_f64_f32_e32 v[10:11], v15 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) |
| ; GFX11FAKE16-NEXT: v_cvt_f64_f32_e32 v[12:13], v16 |
| ; GFX11FAKE16-NEXT: v_and_b32_e32 v18, 1, v15 |
| ; GFX11FAKE16-NEXT: v_cmp_gt_f64_e64 s3, |v[4:5]|, |v[8:9]| |
| ; GFX11FAKE16-NEXT: v_cmp_nlg_f64_e32 vcc_lo, v[4:5], v[8:9] |
| ; GFX11FAKE16-NEXT: v_cmp_nlg_f64_e64 s0, v[0:1], v[10:11] |
| ; GFX11FAKE16-NEXT: v_cmp_nlg_f64_e64 s1, v[2:3], v[12:13] |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v8, -1, 1, s3 |
| ; GFX11FAKE16-NEXT: v_cmp_gt_f64_e64 s3, |v[0:1]|, |v[10:11]| |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v9, -1, 1, s3 |
| ; GFX11FAKE16-NEXT: v_cmp_gt_f64_e64 s3, |v[2:3]|, |v[12:13]| |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) |
| ; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v9, v15, v9 |
| ; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v8, v14, v8 |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v10, -1, 1, s3 |
| ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v18 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) |
| ; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v10, v16, v10 |
| ; GFX11FAKE16-NEXT: v_and_b32_e32 v17, 1, v14 |
| ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v17 |
| ; GFX11FAKE16-NEXT: s_or_b32 vcc_lo, vcc_lo, s2 |
| ; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v8, v8, v14 :: v_dual_and_b32 v19, 1, v16 |
| ; GFX11FAKE16-NEXT: s_or_b32 vcc_lo, s0, s3 |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc_lo |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) |
| ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v19 |
| ; GFX11FAKE16-NEXT: v_bfe_u32 v12, v8, 16, 1 |
| ; GFX11FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v8 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) |
| ; GFX11FAKE16-NEXT: v_bfe_u32 v11, v9, 16, 1 |
| ; GFX11FAKE16-NEXT: s_or_b32 vcc_lo, s1, s4 |
| ; GFX11FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v9 |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v16, vcc_lo |
| ; GFX11FAKE16-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1] |
| ; GFX11FAKE16-NEXT: v_add3_u32 v9, v11, v9, 0x7fff |
| ; GFX11FAKE16-NEXT: v_add3_u32 v8, v12, v8, 0x7fff |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) |
| ; GFX11FAKE16-NEXT: v_bfe_u32 v13, v10, 16, 1 |
| ; GFX11FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v10 |
| ; GFX11FAKE16-NEXT: v_add3_u32 v10, v13, v10, 0x7fff |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v9, v15, vcc_lo |
| ; GFX11FAKE16-NEXT: v_cmp_u_f64_e32 vcc_lo, v[2:3], v[2:3] |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v10, v11, vcc_lo |
| ; GFX11FAKE16-NEXT: v_cmp_u_f64_e32 vcc_lo, v[4:5], v[4:5] |
| ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v6 |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v8, v14, vcc_lo |
| ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v2, 16 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v7 |
| ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] |
| %mag.trunc = fptrunc <3 x double> %mag to <3 x bfloat> |
| %out = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> %mag.trunc, <3 x bfloat> %sign) |
| ret <3 x bfloat> %out |
| } |
| |
| ; copysign on <3 x bfloat> where the sign operand arrives as <3 x float> and is |
| ; narrowed to <3 x bfloat> with fptrunc before the llvm.copysign.v3bf16 call; |
| ; the magnitude operand is already <3 x bfloat>. |
| ; The assertions below are autogenerated; regenerate them with |
| ; utils/update_llc_test_checks.py rather than editing them by hand. |
| define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3bf16_sign_v3f32(<3 x bfloat> %mag, <3 x float> %sign) { |
| ; GCN-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f32: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 |
| ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 |
| ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 |
| ; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v5, 0x8000, v5 |
| ; GCN-NEXT: v_and_b32_e32 v4, 0x8000, v4 |
| ; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3 |
| ; GCN-NEXT: v_or_b32_e32 v2, v2, v5 |
| ; GCN-NEXT: v_or_b32_e32 v1, v1, v4 |
| ; GCN-NEXT: v_or_b32_e32 v0, v0, v3 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f32: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 |
| ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 |
| ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 |
| ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 |
| ; GFX7-NEXT: v_and_b32_e32 v5, 0x8000, v5 |
| ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v4, 0x8000, v4 |
| ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v3 |
| ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 |
| ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f32: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 |
| ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 |
| ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 |
| ; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v2 |
| ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 |
| ; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v5, vcc |
| ; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 |
| ; GFX8-NEXT: s_movk_i32 s4, 0x7fff |
| ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4 |
| ; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 |
| ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 |
| ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc |
| ; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1 |
| ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3 |
| ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 |
| ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 |
| ; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 |
| ; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 |
| ; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v2 |
| ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v4 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 |
| ; GFX9-NEXT: s_movk_i32 s4, 0x7fff |
| ; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 |
| ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 |
| ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc |
| ; GFX9-NEXT: v_bfe_u32 v5, v2, 16, 1 |
| ; GFX9-NEXT: v_add3_u32 v5, v5, v2, s4 |
| ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v2 |
| ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc |
| ; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 |
| ; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 |
| ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 |
| ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc |
| ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 |
| ; GFX9-NEXT: v_perm_b32 v2, v3, v2, s4 |
| ; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v2 |
| ; GFX9-NEXT: v_alignbit_b32 v2, s4, v4, 16 |
| ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1 |
| ; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 |
| ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 |
| ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 |
| ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 |
| ; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff |
| ; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v3 |
| ; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff |
| ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4 |
| ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff |
| ; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v9, vcc_lo |
| ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 |
| ; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v10, vcc_lo |
| ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 |
| ; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x7060302 |
| ; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2 |
| ; GFX10-NEXT: v_alignbit_b32 v3, s4, v4, 16 |
| ; GFX10-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v3 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11TRUE16-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f32: |
| ; GFX11TRUE16: ; %bb.0: |
| ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 |
| ; GFX11TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 |
| ; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 |
| ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 |
| ; GFX11TRUE16-NEXT: v_bfe_u32 v8, v4, 16, 1 |
| ; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff |
| ; GFX11TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 |
| ; GFX11TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v7, vcc_lo |
| ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 |
| ; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4 |
| ; GFX11TRUE16-NEXT: v_add3_u32 v7, v8, v4, 0x7fff |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v9, vcc_lo |
| ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc_lo |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v3 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v3 |
| ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11FAKE16-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f32: |
| ; GFX11FAKE16: ; %bb.0: |
| ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11FAKE16-NEXT: v_bfe_u32 v5, v2, 16, 1 |
| ; GFX11FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 |
| ; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 |
| ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 |
| ; GFX11FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 |
| ; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff |
| ; GFX11FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v3 |
| ; GFX11FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff |
| ; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 |
| ; GFX11FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v9, vcc_lo |
| ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v10, vcc_lo |
| ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) |
| ; GFX11FAKE16-NEXT: v_perm_b32 v2, v3, v2, 0x7060302 |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11FAKE16-NEXT: v_alignbit_b32 v3, s0, v4, 16 |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v3 |
| ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] |
| %sign.trunc = fptrunc <3 x float> %sign to <3 x bfloat> |
| %out = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> %mag, <3 x bfloat> %sign.trunc) |
| ret <3 x bfloat> %out |
| } |
| |
| ; copysign on <3 x bfloat> where the sign operand arrives as <3 x double> and is |
| ; narrowed to <3 x bfloat> with fptrunc before the llvm.copysign.v3bf16 call; |
| ; the magnitude operand is already <3 x bfloat>. |
| ; None of the expected sequences below contains a v_cvt conversion instruction. |
| ; The assertions below are autogenerated; regenerate them with |
| ; utils/update_llc_test_checks.py rather than editing them by hand. |
| define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3bf16_sign_v3f64(<3 x bfloat> %mag, <3 x double> %sign) { |
| ; GCN-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f64: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GCN-NEXT: v_and_b32_e32 v3, 0x80000000, v6 |
| ; GCN-NEXT: v_and_b32_e32 v5, 0x80000000, v8 |
| ; GCN-NEXT: v_and_b32_e32 v4, 0x80000000, v4 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 |
| ; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 |
| ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GCN-NEXT: v_or_b32_e32 v1, v1, v3 |
| ; GCN-NEXT: v_or_b32_e32 v2, v2, v5 |
| ; GCN-NEXT: v_or_b32_e32 v0, v0, v4 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f64: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GFX7-NEXT: v_and_b32_e32 v3, 0x80000000, v6 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 |
| ; GFX7-NEXT: v_and_b32_e32 v3, 0x80000000, v8 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 |
| ; GFX7-NEXT: v_and_b32_e32 v3, 0x80000000, v4 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f64: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v7 |
| ; GFX8-NEXT: s_movk_i32 s4, 0x7fff |
| ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2 |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 |
| ; GFX8-NEXT: v_bfi_b32 v2, s4, v0, v2 |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v5 |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v3 |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f64: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v7 |
| ; GFX9-NEXT: s_movk_i32 s4, 0x7fff |
| ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2 |
| ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v3 |
| ; GFX9-NEXT: v_bfi_b32 v2, s4, v0, v2 |
| ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 |
| ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v5 |
| ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v3 |
| ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 |
| ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f64: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v3 |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v0 |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v5 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v2 |
| ; GFX10-NEXT: v_bfi_b32 v2, 0x7fff, v3, v4 |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v7 |
| ; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 |
| ; GFX10-NEXT: v_bfi_b32 v1, 0x7fff, v1, v3 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11TRUE16-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f64: |
| ; GFX11TRUE16: ; %bb.0: |
| ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l |
| ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h |
| ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v7 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v3, v2 |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0x7fff, v0, v5 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, v4 |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v3.l |
| ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11FAKE16-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f64: |
| ; GFX11FAKE16: ; %bb.0: |
| ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 |
| ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 |
| ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v2 |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, v3, v4 |
| ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v7 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, v3 |
| ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] |
| %sign.trunc = fptrunc <3 x double> %sign to <3 x bfloat> |
| %out = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> %mag, <3 x bfloat> %sign.trunc) |
| ret <3 x bfloat> %out |
| } |
| |
| ; copysign producing <4 x float>: the magnitude arrives as <4 x bfloat> and is |
| ; widened with fpext, while the sign operand is already <4 x float>. |
| ; The per-target assertions inside the function are autogenerated (see the NOTE |
| ; at the top of the file); regenerate them with the update script instead of |
| ; editing by hand. Both gfx1100 RUN lines share the common GFX11 prefix here. |
| define <4 x float> @v_copysign_out_v4f32_mag_v4bf16_sign_v4f32(<4 x bfloat> %mag, <4 x float> %sign) { |
| ; GCN-LABEL: v_copysign_out_v4f32_mag_v4bf16_sign_v4f32: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 |
| ; GCN-NEXT: s_brev_b32 s4, -2 |
| ; GCN-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3 |
| ; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 |
| ; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 |
| ; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 |
| ; GCN-NEXT: v_bfi_b32 v0, s4, v0, v4 |
| ; GCN-NEXT: v_bfi_b32 v1, s4, v1, v5 |
| ; GCN-NEXT: v_bfi_b32 v2, s4, v2, v6 |
| ; GCN-NEXT: v_bfi_b32 v3, s4, v3, v7 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_out_v4f32_mag_v4bf16_sign_v4f32: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 |
| ; GFX7-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3 |
| ; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 |
| ; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1 |
| ; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 |
| ; GFX7-NEXT: s_brev_b32 s4, -2 |
| ; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v4 |
| ; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v5 |
| ; GFX7-NEXT: v_bfi_b32 v2, s4, v2, v6 |
| ; GFX7-NEXT: v_bfi_b32 v3, s4, v3, v7 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_out_v4f32_mag_v4bf16_sign_v4f32: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: v_and_b32_e32 v6, 0x7fff0000, v1 |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v1 |
| ; GFX8-NEXT: v_and_b32_e32 v1, 0x7fff0000, v0 |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX8-NEXT: s_brev_b32 s4, -2 |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v2 |
| ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v3 |
| ; GFX8-NEXT: v_bfi_b32 v2, s4, v7, v4 |
| ; GFX8-NEXT: v_bfi_b32 v3, s4, v6, v5 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_out_v4f32_mag_v4bf16_sign_v4f32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_and_b32_e32 v6, 0x7fff0000, v1 |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v1 |
| ; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff0000, v0 |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX9-NEXT: s_brev_b32 s4, -2 |
| ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v2 |
| ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v3 |
| ; GFX9-NEXT: v_bfi_b32 v2, s4, v7, v4 |
| ; GFX9-NEXT: v_bfi_b32 v3, s4, v6, v5 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_out_v4f32_mag_v4bf16_sign_v4f32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v0 |
| ; GFX10-NEXT: v_and_b32_e32 v7, 0x7fff0000, v0 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v1 |
| ; GFX10-NEXT: v_and_b32_e32 v9, 0x7fff0000, v1 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, v6, v2 |
| ; GFX10-NEXT: v_bfi_b32 v1, 0x7fffffff, v7, v3 |
| ; GFX10-NEXT: v_bfi_b32 v2, 0x7fffffff, v8, v4 |
| ; GFX10-NEXT: v_bfi_b32 v3, 0x7fffffff, v9, v5 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11-LABEL: v_copysign_out_v4f32_mag_v4bf16_sign_v4f32: |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v0 |
| ; GFX11-NEXT: v_and_b32_e32 v7, 0x7fff0000, v0 |
| ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v1 |
| ; GFX11-NEXT: v_and_b32_e32 v9, 0x7fff0000, v1 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) |
| ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v6, v2 |
| ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v7, v3 |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) |
| ; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v8, v4 |
| ; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v9, v5 |
| ; GFX11-NEXT: s_setpc_b64 s[30:31] |
| ; Widen each bfloat magnitude lane to float, then take the sign from %sign. |
| %mag.ext = fpext <4 x bfloat> %mag to <4 x float> |
| %out = call <4 x float> @llvm.copysign.v4f32(<4 x float> %mag.ext, <4 x float> %sign) |
| ret <4 x float> %out |
| } |
| |
| ; copysign producing <4 x float>: the magnitude is already <4 x float>, while |
| ; the sign operand arrives as <4 x bfloat> and is widened with fpext. |
| ; The per-target assertions inside the function are autogenerated (see the NOTE |
| ; at the top of the file); regenerate them with the update script instead of |
| ; editing by hand. |
| define <4 x float> @v_copysign_out_v4f32_mag_v4f32_sign_v4bf16(<4 x float> %mag, <4 x bfloat> %sign) { |
| ; GCN-LABEL: v_copysign_out_v4f32_mag_v4f32_sign_v4bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 |
| ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 |
| ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 |
| ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 |
| ; GCN-NEXT: s_brev_b32 s4, -2 |
| ; GCN-NEXT: v_bfi_b32 v0, s4, v0, v4 |
| ; GCN-NEXT: v_bfi_b32 v1, s4, v1, v5 |
| ; GCN-NEXT: v_bfi_b32 v2, s4, v2, v6 |
| ; GCN-NEXT: v_bfi_b32 v3, s4, v3, v7 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_out_v4f32_mag_v4f32_sign_v4bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 |
| ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 |
| ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 |
| ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 |
| ; GFX7-NEXT: s_brev_b32 s4, -2 |
| ; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v4 |
| ; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v5 |
| ; GFX7-NEXT: v_bfi_b32 v2, s4, v2, v6 |
| ; GFX7-NEXT: v_bfi_b32 v3, s4, v3, v7 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_out_v4f32_mag_v4f32_sign_v4bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v4 |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 |
| ; GFX8-NEXT: s_brev_b32 s4, -2 |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v4 |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5 |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v5 |
| ; GFX8-NEXT: v_bfi_b32 v2, s4, v2, v4 |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v7 |
| ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v4 |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 |
| ; GFX8-NEXT: v_bfi_b32 v3, s4, v3, v4 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_out_v4f32_mag_v4f32_sign_v4bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 |
| ; GFX9-NEXT: s_brev_b32 s4, -2 |
| ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v4 |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5 |
| ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v5 |
| ; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v4 |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v7 |
| ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v4 |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v6 |
| ; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v4 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_out_v4f32_mag_v4f32_sign_v4bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v4 |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v4 |
| ; GFX10-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v5 |
| ; GFX10-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v6 |
| ; GFX10-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v7 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11TRUE16-LABEL: v_copysign_out_v4f32_mag_v4f32_sign_v4bf16: |
| ; GFX11TRUE16: ; %bb.0: |
| ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) |
| ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 |
| ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) |
| ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 |
| ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v6 |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v4 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v7 |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v5 |
| ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11FAKE16-LABEL: v_copysign_out_v4f32_mag_v4f32_sign_v4bf16: |
| ; GFX11FAKE16: ; %bb.0: |
| ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4 |
| ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v5 |
| ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 |
| ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) |
| ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 |
| ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v4 |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v5 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v6 |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v7 |
| ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] |
| ; Widen each bfloat sign lane to float so both copysign operands are <4 x float>. |
| %sign.ext = fpext <4 x bfloat> %sign to <4 x float> |
| %out = call <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sign.ext) |
| ret <4 x float> %out |
| } |
| |
| ; copysign producing <4 x double>: the magnitude is already <4 x double>, while |
| ; the sign operand arrives as <4 x bfloat> and is widened with fpext to double. |
| ; The per-target assertions inside the function are autogenerated (see the NOTE |
| ; at the top of the file); regenerate them with the update script instead of |
| ; editing by hand. |
| define <4 x double> @v_copysign_out_v4f64_mag_v4f64_sign_v4bf16(<4 x double> %mag, <4 x bfloat> %sign) { |
| ; GCN-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_brev_b32 s4, -2 |
| ; GCN-NEXT: v_bfi_b32 v1, s4, v1, v8 |
| ; GCN-NEXT: v_bfi_b32 v3, s4, v3, v9 |
| ; GCN-NEXT: v_bfi_b32 v5, s4, v5, v10 |
| ; GCN-NEXT: v_bfi_b32 v7, s4, v7, v11 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: s_brev_b32 s4, -2 |
| ; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v8 |
| ; GFX7-NEXT: v_bfi_b32 v3, s4, v3, v9 |
| ; GFX7-NEXT: v_bfi_b32 v5, s4, v5, v10 |
| ; GFX7-NEXT: v_bfi_b32 v7, s4, v7, v11 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v8 |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 |
| ; GFX8-NEXT: s_brev_b32 s4, -2 |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 |
| ; GFX8-NEXT: v_bfi_b32 v3, s4, v3, v8 |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v9 |
| ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v10 |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9 |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 |
| ; GFX8-NEXT: v_bfi_b32 v5, s4, v5, v10 |
| ; GFX8-NEXT: v_bfi_b32 v7, s4, v7, v8 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v8 |
| ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 |
| ; GFX9-NEXT: s_brev_b32 s4, -2 |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 |
| ; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v8 |
| ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v9 |
| ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v10 |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9 |
| ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 |
| ; GFX9-NEXT: v_bfi_b32 v5, s4, v5, v10 |
| ; GFX9-NEXT: v_bfi_b32 v7, s4, v7, v8 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v8 |
| ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v9 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10 |
| ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11 |
| ; GFX10-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v8 |
| ; GFX10-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v9 |
| ; GFX10-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v10 |
| ; GFX10-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v11 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11TRUE16-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4bf16: |
| ; GFX11TRUE16: ; %bb.0: |
| ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, v9.l |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v9.h |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) |
| ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 |
| ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) |
| ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 |
| ; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v10 |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v11 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v8 |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v9 |
| ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11FAKE16-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4bf16: |
| ; GFX11FAKE16: ; %bb.0: |
| ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8 |
| ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v9 |
| ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 |
| ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) |
| ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 |
| ; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v8 |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v9 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v10 |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v11 |
| ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] |
| ; Widen each bfloat sign lane to double so both copysign operands are <4 x double>. |
| %sign.ext = fpext <4 x bfloat> %sign to <4 x double> |
| %out = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign.ext) |
| ret <4 x double> %out |
| } |
| |
| ; copysign producing <4 x bfloat>: the magnitude arrives as <4 x float> and is |
| ; narrowed with fptrunc, while the sign operand is already <4 x bfloat>. |
| ; The per-target assertions inside the function are autogenerated (see the NOTE |
| ; at the top of the file); regenerate them with the update script instead of |
| ; editing by hand. |
| define <4 x bfloat> @v_copysign_out_v4bf16_mag_v4f32_sign_v4bf16(<4 x float> %mag, <4 x bfloat> %sign) { |
| ; GCN-LABEL: v_copysign_out_v4bf16_mag_v4f32_sign_v4bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 |
| ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 |
| ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 |
| ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 |
| ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 |
| ; GCN-NEXT: v_bfe_u32 v3, v3, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v7, 0x8000, v7 |
| ; GCN-NEXT: v_and_b32_e32 v6, 0x8000, v6 |
| ; GCN-NEXT: v_and_b32_e32 v5, 0x8000, v5 |
| ; GCN-NEXT: v_and_b32_e32 v4, 0x8000, v4 |
| ; GCN-NEXT: v_or_b32_e32 v3, v3, v7 |
| ; GCN-NEXT: v_or_b32_e32 v2, v2, v6 |
| ; GCN-NEXT: v_or_b32_e32 v1, v1, v5 |
| ; GCN-NEXT: v_or_b32_e32 v0, v0, v4 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_out_v4bf16_mag_v4f32_sign_v4bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 |
| ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 |
| ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 |
| ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 |
| ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 |
| ; GFX7-NEXT: v_and_b32_e32 v7, 0x8000, v7 |
| ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v6, 0x8000, v6 |
| ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v5, 0x8000, v5 |
| ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v4, 0x8000, v4 |
| ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v3, v3, v7 |
| ; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 |
| ; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_out_v4bf16_mag_v4f32_sign_v4bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 |
| ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 |
| ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 |
| ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v2 |
| ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 |
| ; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v6, vcc |
| ; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 |
| ; GFX8-NEXT: s_movk_i32 s4, 0x7fff |
| ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 |
| ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7 |
| ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3 |
| ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 |
| ; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v6, vcc |
| ; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1 |
| ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0 |
| ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7 |
| ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v0 |
| ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 |
| ; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc |
| ; GFX8-NEXT: v_bfe_u32 v7, v1, 16, 1 |
| ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v1 |
| ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 |
| ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1 |
| ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 |
| ; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 |
| ; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX8-NEXT: v_alignbit_b32 v1, v3, v2, 16 |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v4 |
| ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v5 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_out_v4bf16_mag_v4f32_sign_v4bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 |
| ; GFX9-NEXT: s_movk_i32 s4, 0x7fff |
| ; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4 |
| ; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v2 |
| ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc |
| ; GFX9-NEXT: v_bfe_u32 v6, v3, 16, 1 |
| ; GFX9-NEXT: v_add3_u32 v6, v6, v3, s4 |
| ; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3 |
| ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc |
| ; GFX9-NEXT: v_bfe_u32 v6, v0, 16, 1 |
| ; GFX9-NEXT: v_add3_u32 v6, v6, v0, s4 |
| ; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v0 |
| ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc |
| ; GFX9-NEXT: v_bfe_u32 v6, v1, 16, 1 |
| ; GFX9-NEXT: v_add3_u32 v6, v6, v1, s4 |
| ; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v1 |
| ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc |
| ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 |
| ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 |
| ; GFX9-NEXT: s_mov_b32 s5, 0x7fff7fff |
| ; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 |
| ; GFX9-NEXT: v_bfi_b32 v0, s5, v0, v4 |
| ; GFX9-NEXT: v_bfi_b32 v1, s5, v1, v5 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_out_v4bf16_mag_v4f32_sign_v4bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1 |
| ; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1 |
| ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 |
| ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 |
| ; GFX10-NEXT: v_bfe_u32 v10, v1, 16, 1 |
| ; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff |
| ; GFX10-NEXT: v_add3_u32 v8, v8, v0, 0x7fff |
| ; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1 |
| ; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v3 |
| ; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc_lo |
| ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0 |
| ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 |
| ; GFX10-NEXT: v_add3_u32 v9, v10, v1, 0x7fff |
| ; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v1 |
| ; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff |
| ; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v6, vcc_lo |
| ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 |
| ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo |
| ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 |
| ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 |
| ; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v11, vcc_lo |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v4 |
| ; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 |
| ; GFX10-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v5 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11TRUE16-LABEL: v_copysign_out_v4bf16_mag_v4f32_sign_v4bf16: |
| ; GFX11TRUE16: ; %bb.0: |
| ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 |
| ; GFX11TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 |
| ; GFX11TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1 |
| ; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v2 |
| ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 |
| ; GFX11TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff |
| ; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 |
| ; GFX11TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff |
| ; GFX11TRUE16-NEXT: v_bfe_u32 v11, v1, 16, 1 |
| ; GFX11TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff |
| ; GFX11TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v0 |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v10, vcc_lo |
| ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 |
| ; GFX11TRUE16-NEXT: v_add3_u32 v7, v11, v1, 0x7fff |
| ; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1 |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo |
| ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo |
| ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v10, vcc_lo |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v2, v3 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v4 |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v5 |
| ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11FAKE16-LABEL: v_copysign_out_v4bf16_mag_v4f32_sign_v4bf16: |
| ; GFX11FAKE16: ; %bb.0: |
| ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11FAKE16-NEXT: v_bfe_u32 v6, v2, 16, 1 |
| ; GFX11FAKE16-NEXT: v_bfe_u32 v8, v0, 16, 1 |
| ; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 |
| ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 |
| ; GFX11FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1 |
| ; GFX11FAKE16-NEXT: v_add3_u32 v6, v6, v2, 0x7fff |
| ; GFX11FAKE16-NEXT: v_bfe_u32 v10, v1, 16, 1 |
| ; GFX11FAKE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff |
| ; GFX11FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v3 |
| ; GFX11FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc_lo |
| ; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0 |
| ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 |
| ; GFX11FAKE16-NEXT: v_add3_u32 v9, v10, v1, 0x7fff |
| ; GFX11FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v1 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v8, v6, vcc_lo |
| ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo |
| ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v11, vcc_lo |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 |
| ; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v4 |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v5 |
| ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] |
| ; Narrow each float magnitude lane to bfloat first; copysign then runs on <4 x bfloat>. |
| %mag.trunc = fptrunc <4 x float> %mag to <4 x bfloat> |
| %out = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> %mag.trunc, <4 x bfloat> %sign) |
| ret <4 x bfloat> %out |
| } |
| |
| define <4 x bfloat> @v_copysign_out_v4bf16_mag_v4f64_sign_v4bf16(<4 x double> %mag, <4 x bfloat> %sign) { |
| ; GCN-LABEL: v_copysign_out_v4bf16_mag_v4f64_sign_v4bf16: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 |
| ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 |
| ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 |
| ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 |
| ; GCN-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] |
| ; GCN-NEXT: v_cvt_f32_f64_e32 v1, v[2:3] |
| ; GCN-NEXT: v_cvt_f32_f64_e32 v2, v[4:5] |
| ; GCN-NEXT: v_cvt_f32_f64_e32 v3, v[6:7] |
| ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v8 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v9 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v10 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v11 |
| ; GCN-NEXT: v_bfe_u32 v3, v3, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v7, 0x8000, v7 |
| ; GCN-NEXT: v_and_b32_e32 v6, 0x8000, v6 |
| ; GCN-NEXT: v_and_b32_e32 v5, 0x8000, v5 |
| ; GCN-NEXT: v_and_b32_e32 v4, 0x8000, v4 |
| ; GCN-NEXT: v_or_b32_e32 v3, v3, v7 |
| ; GCN-NEXT: v_or_b32_e32 v2, v2, v6 |
| ; GCN-NEXT: v_or_b32_e32 v1, v1, v5 |
| ; GCN-NEXT: v_or_b32_e32 v0, v0, v4 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_out_v4bf16_mag_v4f64_sign_v4bf16: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] |
| ; GFX7-NEXT: v_cvt_f32_f64_e32 v1, v[6:7] |
| ; GFX7-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] |
| ; GFX7-NEXT: v_cvt_f32_f64_e32 v3, v[4:5] |
| ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 |
| ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11 |
| ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 |
| ; GFX7-NEXT: v_and_b32_e32 v4, 0x8000, v11 |
| ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 |
| ; GFX7-NEXT: v_or_b32_e32 v4, v1, v4 |
| ; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v10 |
| ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 |
| ; GFX7-NEXT: v_or_b32_e32 v3, v3, v1 |
| ; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v9 |
| ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 |
| ; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v8 |
| ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_out_v4bf16_mag_v4f64_sign_v4bf16: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: v_cvt_f32_f64_e32 v12, v[4:5] |
| ; GFX8-NEXT: s_movk_i32 s8, 0x7fff |
| ; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v12 |
| ; GFX8-NEXT: v_and_b32_e32 v13, 1, v12 |
| ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v13 |
| ; GFX8-NEXT: v_cvt_f32_f64_e32 v13, v[6:7] |
| ; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[4:5]|, |v[10:11]| |
| ; GFX8-NEXT: v_cmp_nlg_f64_e32 vcc, v[4:5], v[10:11] |
| ; GFX8-NEXT: v_cndmask_b32_e64 v10, -1, 1, s[6:7] |
| ; GFX8-NEXT: v_add_u32_e64 v10, s[6:7], v12, v10 |
| ; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5] |
| ; GFX8-NEXT: v_cndmask_b32_e32 v12, v10, v12, vcc |
| ; GFX8-NEXT: v_bfe_u32 v10, v12, 16, 1 |
| ; GFX8-NEXT: v_add_u32_e32 v14, vcc, v10, v12 |
| ; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v13 |
| ; GFX8-NEXT: v_add_u32_e32 v14, vcc, s8, v14 |
| ; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], v[4:5], v[4:5] |
| ; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[6:7]|, |v[10:11]| |
| ; GFX8-NEXT: v_cmp_nlg_f64_e32 vcc, v[6:7], v[10:11] |
| ; GFX8-NEXT: v_or_b32_e32 v12, 0x400000, v12 |
| ; GFX8-NEXT: v_and_b32_e32 v4, 1, v13 |
| ; GFX8-NEXT: v_cvt_f32_f64_e32 v11, v[0:1] |
| ; GFX8-NEXT: v_cndmask_b32_e64 v12, v14, v12, s[4:5] |
| ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v4 |
| ; GFX8-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7] |
| ; GFX8-NEXT: v_add_u32_e64 v4, s[6:7], v13, v4 |
| ; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5] |
| ; GFX8-NEXT: v_cndmask_b32_e32 v10, v4, v13, vcc |
| ; GFX8-NEXT: v_bfe_u32 v4, v10, 16, 1 |
| ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v10 |
| ; GFX8-NEXT: v_add_u32_e32 v13, vcc, s8, v4 |
| ; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[6:7] |
| ; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v11 |
| ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 |
| ; GFX8-NEXT: v_and_b32_e32 v7, 1, v11 |
| ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v7 |
| ; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[4:5]| |
| ; GFX8-NEXT: v_cndmask_b32_e32 v6, v13, v10, vcc |
| ; GFX8-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5] |
| ; GFX8-NEXT: v_cvt_f32_f64_e32 v10, v[2:3] |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 |
| ; GFX8-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7] |
| ; GFX8-NEXT: v_add_u32_e64 v4, s[6:7], v11, v4 |
| ; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5] |
| ; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v11, vcc |
| ; GFX8-NEXT: v_bfe_u32 v4, v7, 16, 1 |
| ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v4, v7 |
| ; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 |
| ; GFX8-NEXT: v_add_u32_e32 v11, vcc, s8, v11 |
| ; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], v[0:1], v[0:1] |
| ; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[2:3]|, |v[4:5]| |
| ; GFX8-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[4:5] |
| ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v7 |
| ; GFX8-NEXT: v_and_b32_e32 v1, 1, v10 |
| ; GFX8-NEXT: v_cndmask_b32_e64 v0, v11, v7, s[4:5] |
| ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1 |
| ; GFX8-NEXT: v_cndmask_b32_e64 v1, -1, 1, s[6:7] |
| ; GFX8-NEXT: v_add_u32_e64 v1, s[6:7], v10, v1 |
| ; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5] |
| ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc |
| ; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1 |
| ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1 |
| ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 |
| ; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3] |
| ; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v1 |
| ; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 |
| ; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 |
| ; GFX8-NEXT: v_alignbit_b32 v1, v6, v12, 16 |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v8 |
| ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v9 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_out_v4bf16_mag_v4f64_sign_v4bf16: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_cvt_f32_f64_e32 v12, v[4:5] |
| ; GFX9-NEXT: s_movk_i32 s8, 0x7fff |
| ; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v12 |
| ; GFX9-NEXT: v_and_b32_e32 v13, 1, v12 |
| ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v13 |
| ; GFX9-NEXT: v_cvt_f32_f64_e32 v13, v[6:7] |
| ; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[4:5]|, |v[10:11]| |
| ; GFX9-NEXT: v_cmp_nlg_f64_e32 vcc, v[4:5], v[10:11] |
| ; GFX9-NEXT: v_cndmask_b32_e64 v10, -1, 1, s[6:7] |
| ; GFX9-NEXT: v_add_u32_e32 v10, v12, v10 |
| ; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] |
| ; GFX9-NEXT: v_cndmask_b32_e32 v12, v10, v12, vcc |
| ; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v13 |
| ; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[4:5], v[4:5] |
| ; GFX9-NEXT: v_bfe_u32 v14, v12, 16, 1 |
| ; GFX9-NEXT: v_add3_u32 v14, v14, v12, s8 |
| ; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[6:7]|, |v[10:11]| |
| ; GFX9-NEXT: v_cmp_nlg_f64_e32 vcc, v[6:7], v[10:11] |
| ; GFX9-NEXT: v_cvt_f32_f64_e32 v10, v[0:1] |
| ; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v12 |
| ; GFX9-NEXT: v_and_b32_e32 v4, 1, v13 |
| ; GFX9-NEXT: v_cndmask_b32_e64 v12, v14, v12, s[4:5] |
| ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v4 |
| ; GFX9-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7] |
| ; GFX9-NEXT: v_add_u32_e32 v4, v13, v4 |
| ; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] |
| ; GFX9-NEXT: v_cndmask_b32_e32 v11, v4, v13, vcc |
| ; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 |
| ; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[6:7], v[6:7] |
| ; GFX9-NEXT: v_bfe_u32 v13, v11, 16, 1 |
| ; GFX9-NEXT: v_add3_u32 v13, v13, v11, s8 |
| ; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[4:5]| |
| ; GFX9-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5] |
| ; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v11 |
| ; GFX9-NEXT: v_and_b32_e32 v7, 1, v10 |
| ; GFX9-NEXT: v_cndmask_b32_e64 v6, v13, v11, s[4:5] |
| ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v7 |
| ; GFX9-NEXT: v_cvt_f32_f64_e32 v7, v[2:3] |
| ; GFX9-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7] |
| ; GFX9-NEXT: v_add_u32_e32 v4, v10, v4 |
| ; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] |
| ; GFX9-NEXT: v_cndmask_b32_e32 v10, v4, v10, vcc |
| ; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 |
| ; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[0:1], v[0:1] |
| ; GFX9-NEXT: v_bfe_u32 v11, v10, 16, 1 |
| ; GFX9-NEXT: v_add3_u32 v11, v11, v10, s8 |
| ; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[2:3]|, |v[4:5]| |
| ; GFX9-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[4:5] |
| ; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10 |
| ; GFX9-NEXT: v_and_b32_e32 v1, 1, v7 |
| ; GFX9-NEXT: v_cndmask_b32_e64 v0, v11, v10, s[4:5] |
| ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1 |
| ; GFX9-NEXT: v_cndmask_b32_e64 v1, -1, 1, s[6:7] |
| ; GFX9-NEXT: v_add_u32_e32 v1, v7, v1 |
| ; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] |
| ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc |
| ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3] |
| ; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1 |
| ; GFX9-NEXT: v_add3_u32 v4, v4, v1, s8 |
| ; GFX9-NEXT: v_or_b32_e32 v1, 0x400000, v1 |
| ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 |
| ; GFX9-NEXT: s_mov_b32 s5, 0x7fff7fff |
| ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc |
| ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 |
| ; GFX9-NEXT: v_perm_b32 v1, v6, v12, s4 |
| ; GFX9-NEXT: v_bfi_b32 v0, s5, v0, v8 |
| ; GFX9-NEXT: v_bfi_b32 v1, s5, v1, v9 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_out_v4bf16_mag_v4f64_sign_v4bf16: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_cvt_f32_f64_e32 v18, v[4:5] |
| ; GFX10-NEXT: v_cvt_f32_f64_e32 v19, v[6:7] |
| ; GFX10-NEXT: v_cvt_f32_f64_e32 v20, v[0:1] |
| ; GFX10-NEXT: v_cvt_f32_f64_e32 v21, v[2:3] |
| ; GFX10-NEXT: v_cvt_f64_f32_e32 v[10:11], v18 |
| ; GFX10-NEXT: v_cvt_f64_f32_e32 v[12:13], v19 |
| ; GFX10-NEXT: v_cvt_f64_f32_e32 v[14:15], v20 |
| ; GFX10-NEXT: v_cvt_f64_f32_e32 v[16:17], v21 |
| ; GFX10-NEXT: v_and_b32_e32 v22, 1, v18 |
| ; GFX10-NEXT: v_and_b32_e32 v23, 1, v19 |
| ; GFX10-NEXT: v_and_b32_e32 v24, 1, v20 |
| ; GFX10-NEXT: v_and_b32_e32 v25, 1, v21 |
| ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v22 |
| ; GFX10-NEXT: v_cmp_eq_u32_e64 s8, 1, v23 |
| ; GFX10-NEXT: v_cmp_eq_u32_e64 s9, 1, v24 |
| ; GFX10-NEXT: v_cmp_gt_f64_e64 s10, |v[4:5]|, |v[10:11]| |
| ; GFX10-NEXT: v_cmp_nlg_f64_e32 vcc_lo, v[4:5], v[10:11] |
| ; GFX10-NEXT: v_cmp_nlg_f64_e64 s4, v[6:7], v[12:13] |
| ; GFX10-NEXT: v_cmp_nlg_f64_e64 s5, v[0:1], v[14:15] |
| ; GFX10-NEXT: v_cmp_nlg_f64_e64 s7, v[2:3], v[16:17] |
| ; GFX10-NEXT: v_cndmask_b32_e64 v10, -1, 1, s10 |
| ; GFX10-NEXT: v_cmp_gt_f64_e64 s10, |v[6:7]|, |v[12:13]| |
| ; GFX10-NEXT: s_or_b32 vcc_lo, vcc_lo, s6 |
| ; GFX10-NEXT: v_add_nc_u32_e32 v10, v18, v10 |
| ; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v18, vcc_lo |
| ; GFX10-NEXT: s_or_b32 vcc_lo, s4, s8 |
| ; GFX10-NEXT: v_cndmask_b32_e64 v11, -1, 1, s10 |
| ; GFX10-NEXT: v_cmp_gt_f64_e64 s10, |v[0:1]|, |v[14:15]| |
| ; GFX10-NEXT: v_bfe_u32 v14, v10, 16, 1 |
| ; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v10 |
| ; GFX10-NEXT: v_add_nc_u32_e32 v11, v19, v11 |
| ; GFX10-NEXT: v_add3_u32 v10, v14, v10, 0x7fff |
| ; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v19, vcc_lo |
| ; GFX10-NEXT: s_or_b32 vcc_lo, s5, s9 |
| ; GFX10-NEXT: v_cndmask_b32_e64 v12, -1, 1, s10 |
| ; GFX10-NEXT: v_cmp_gt_f64_e64 s10, |v[2:3]|, |v[16:17]| |
| ; GFX10-NEXT: v_bfe_u32 v16, v11, 16, 1 |
| ; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v11 |
| ; GFX10-NEXT: v_add_nc_u32_e32 v12, v20, v12 |
| ; GFX10-NEXT: v_add3_u32 v11, v16, v11, 0x7fff |
| ; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v20, vcc_lo |
| ; GFX10-NEXT: v_bfe_u32 v18, v12, 16, 1 |
| ; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v12 |
| ; GFX10-NEXT: v_add3_u32 v12, v18, v12, 0x7fff |
| ; GFX10-NEXT: v_cndmask_b32_e64 v13, -1, 1, s10 |
| ; GFX10-NEXT: v_cmp_eq_u32_e64 s10, 1, v25 |
| ; GFX10-NEXT: v_add_nc_u32_e32 v13, v21, v13 |
| ; GFX10-NEXT: s_or_b32 vcc_lo, s7, s10 |
| ; GFX10-NEXT: v_cndmask_b32_e32 v13, v13, v21, vcc_lo |
| ; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[4:5], v[4:5] |
| ; GFX10-NEXT: v_bfe_u32 v20, v13, 16, 1 |
| ; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v13 |
| ; GFX10-NEXT: v_add3_u32 v13, v20, v13, 0x7fff |
| ; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v15, vcc_lo |
| ; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1] |
| ; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v19, vcc_lo |
| ; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[2:3], v[2:3] |
| ; GFX10-NEXT: v_cndmask_b32_e32 v1, v13, v21, vcc_lo |
| ; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[6:7], v[6:7] |
| ; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v8 |
| ; GFX10-NEXT: v_cndmask_b32_e32 v2, v11, v17, vcc_lo |
| ; GFX10-NEXT: v_perm_b32 v1, v2, v4, 0x7060302 |
| ; GFX10-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v9 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11TRUE16-LABEL: v_copysign_out_v4bf16_mag_v4f64_sign_v4bf16: |
| ; GFX11TRUE16: ; %bb.0: |
| ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11TRUE16-NEXT: v_cvt_f32_f64_e32 v18, v[6:7] |
| ; GFX11TRUE16-NEXT: v_cvt_f32_f64_e32 v19, v[4:5] |
| ; GFX11TRUE16-NEXT: v_cvt_f32_f64_e32 v20, v[2:3] |
| ; GFX11TRUE16-NEXT: v_cvt_f32_f64_e32 v21, v[0:1] |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) |
| ; GFX11TRUE16-NEXT: v_cvt_f64_f32_e32 v[10:11], v18 |
| ; GFX11TRUE16-NEXT: v_cvt_f64_f32_e32 v[12:13], v19 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) |
| ; GFX11TRUE16-NEXT: v_cvt_f64_f32_e32 v[14:15], v20 |
| ; GFX11TRUE16-NEXT: v_cvt_f64_f32_e32 v[16:17], v21 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) |
| ; GFX11TRUE16-NEXT: v_cmp_gt_f64_e64 s6, |v[6:7]|, |v[10:11]| |
| ; GFX11TRUE16-NEXT: v_cmp_nlg_f64_e32 vcc_lo, v[6:7], v[10:11] |
| ; GFX11TRUE16-NEXT: v_cmp_nlg_f64_e64 s0, v[4:5], v[12:13] |
| ; GFX11TRUE16-NEXT: v_cmp_nlg_f64_e64 s1, v[2:3], v[14:15] |
| ; GFX11TRUE16-NEXT: v_cmp_nlg_f64_e64 s2, v[0:1], v[16:17] |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v10, -1, 1, s6 |
| ; GFX11TRUE16-NEXT: v_cmp_gt_f64_e64 s6, |v[4:5]|, |v[12:13]| |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v10, v18, v10 |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v11, -1, 1, s6 |
| ; GFX11TRUE16-NEXT: v_cmp_gt_f64_e64 s6, |v[2:3]|, |v[14:15]| |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) |
| ; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v11, v19, v11 |
| ; GFX11TRUE16-NEXT: v_and_b32_e32 v22, 1, v18 |
| ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v22 |
| ; GFX11TRUE16-NEXT: s_or_b32 vcc_lo, vcc_lo, s3 |
| ; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v10, v10, v18 :: v_dual_and_b32 v23, 1, v19 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v23 |
| ; GFX11TRUE16-NEXT: v_bfe_u32 v14, v10, 16, 1 |
| ; GFX11TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v10 |
| ; GFX11TRUE16-NEXT: s_or_b32 vcc_lo, s0, s4 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) |
| ; GFX11TRUE16-NEXT: v_add3_u32 v10, v14, v10, 0x7fff |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v19, vcc_lo |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v12, -1, 1, s6 |
| ; GFX11TRUE16-NEXT: v_cmp_gt_f64_e64 s6, |v[0:1]|, |v[16:17]| |
| ; GFX11TRUE16-NEXT: v_bfe_u32 v16, v11, 16, 1 |
| ; GFX11TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v11 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) |
| ; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v12, v20, v12 |
| ; GFX11TRUE16-NEXT: v_add3_u32 v11, v16, v11, 0x7fff |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v13, -1, 1, s6 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) |
| ; GFX11TRUE16-NEXT: v_add_nc_u32_e32 v13, v21, v13 |
| ; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 1, v20 |
| ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v24 |
| ; GFX11TRUE16-NEXT: s_or_b32 vcc_lo, s1, s5 |
| ; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v12, v12, v20 :: v_dual_and_b32 v25, 1, v21 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v25 |
| ; GFX11TRUE16-NEXT: v_bfe_u32 v18, v12, 16, 1 |
| ; GFX11TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v12 |
| ; GFX11TRUE16-NEXT: s_or_b32 vcc_lo, s2, s6 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) |
| ; GFX11TRUE16-NEXT: v_add3_u32 v12, v18, v12, 0x7fff |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v13, v13, v21, vcc_lo |
| ; GFX11TRUE16-NEXT: v_cmp_u_f64_e32 vcc_lo, v[4:5], v[4:5] |
| ; GFX11TRUE16-NEXT: v_bfe_u32 v19, v13, 16, 1 |
| ; GFX11TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v13 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) |
| ; GFX11TRUE16-NEXT: v_add3_u32 v13, v19, v13, 0x7fff |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v11, v17, vcc_lo |
| ; GFX11TRUE16-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1] |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v13, v21, vcc_lo |
| ; GFX11TRUE16-NEXT: v_cmp_u_f64_e32 vcc_lo, v[6:7], v[6:7] |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v10, v15, vcc_lo |
| ; GFX11TRUE16-NEXT: v_cmp_u_f64_e32 vcc_lo, v[2:3], v[2:3] |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v3, v1 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v9 |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v12, v20, vcc_lo |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v8 |
| ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11FAKE16-LABEL: v_copysign_out_v4bf16_mag_v4f64_sign_v4bf16: |
| ; GFX11FAKE16: ; %bb.0: |
| ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11FAKE16-NEXT: v_cvt_f32_f64_e32 v18, v[4:5] |
| ; GFX11FAKE16-NEXT: v_cvt_f32_f64_e32 v19, v[6:7] |
| ; GFX11FAKE16-NEXT: v_cvt_f32_f64_e32 v20, v[0:1] |
| ; GFX11FAKE16-NEXT: v_cvt_f32_f64_e32 v21, v[2:3] |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) |
| ; GFX11FAKE16-NEXT: v_cvt_f64_f32_e32 v[10:11], v18 |
| ; GFX11FAKE16-NEXT: v_cvt_f64_f32_e32 v[12:13], v19 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) |
| ; GFX11FAKE16-NEXT: v_cvt_f64_f32_e32 v[14:15], v20 |
| ; GFX11FAKE16-NEXT: v_cvt_f64_f32_e32 v[16:17], v21 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) |
| ; GFX11FAKE16-NEXT: v_cmp_gt_f64_e64 s6, |v[4:5]|, |v[10:11]| |
| ; GFX11FAKE16-NEXT: v_cmp_nlg_f64_e32 vcc_lo, v[4:5], v[10:11] |
| ; GFX11FAKE16-NEXT: v_cmp_nlg_f64_e64 s0, v[6:7], v[12:13] |
| ; GFX11FAKE16-NEXT: v_cmp_nlg_f64_e64 s1, v[0:1], v[14:15] |
| ; GFX11FAKE16-NEXT: v_cmp_nlg_f64_e64 s2, v[2:3], v[16:17] |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v10, -1, 1, s6 |
| ; GFX11FAKE16-NEXT: v_cmp_gt_f64_e64 s6, |v[6:7]|, |v[12:13]| |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v10, v18, v10 |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v11, -1, 1, s6 |
| ; GFX11FAKE16-NEXT: v_cmp_gt_f64_e64 s6, |v[0:1]|, |v[14:15]| |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) |
| ; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v11, v19, v11 |
| ; GFX11FAKE16-NEXT: v_and_b32_e32 v22, 1, v18 |
| ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v22 |
| ; GFX11FAKE16-NEXT: s_or_b32 vcc_lo, vcc_lo, s3 |
| ; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v10, v10, v18 :: v_dual_and_b32 v23, 1, v19 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v23 |
| ; GFX11FAKE16-NEXT: v_bfe_u32 v14, v10, 16, 1 |
| ; GFX11FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v10 |
| ; GFX11FAKE16-NEXT: s_or_b32 vcc_lo, s0, s4 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) |
| ; GFX11FAKE16-NEXT: v_add3_u32 v10, v14, v10, 0x7fff |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v19, vcc_lo |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v12, -1, 1, s6 |
| ; GFX11FAKE16-NEXT: v_cmp_gt_f64_e64 s6, |v[2:3]|, |v[16:17]| |
| ; GFX11FAKE16-NEXT: v_bfe_u32 v16, v11, 16, 1 |
| ; GFX11FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v11 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) |
| ; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v12, v20, v12 |
| ; GFX11FAKE16-NEXT: v_add3_u32 v11, v16, v11, 0x7fff |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v13, -1, 1, s6 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) |
| ; GFX11FAKE16-NEXT: v_add_nc_u32_e32 v13, v21, v13 |
| ; GFX11FAKE16-NEXT: v_and_b32_e32 v24, 1, v20 |
| ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v24 |
| ; GFX11FAKE16-NEXT: s_or_b32 vcc_lo, s1, s5 |
| ; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v12, v12, v20 :: v_dual_and_b32 v25, 1, v21 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v25 |
| ; GFX11FAKE16-NEXT: v_bfe_u32 v18, v12, 16, 1 |
| ; GFX11FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v12 |
| ; GFX11FAKE16-NEXT: s_or_b32 vcc_lo, s2, s6 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) |
| ; GFX11FAKE16-NEXT: v_add3_u32 v12, v18, v12, 0x7fff |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v13, v21, vcc_lo |
| ; GFX11FAKE16-NEXT: v_cmp_u_f64_e32 vcc_lo, v[4:5], v[4:5] |
| ; GFX11FAKE16-NEXT: v_bfe_u32 v20, v13, 16, 1 |
| ; GFX11FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v13 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) |
| ; GFX11FAKE16-NEXT: v_add3_u32 v13, v20, v13, 0x7fff |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v10, v15, vcc_lo |
| ; GFX11FAKE16-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1] |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v12, v19, vcc_lo |
| ; GFX11FAKE16-NEXT: v_cmp_u_f64_e32 vcc_lo, v[2:3], v[2:3] |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v13, v21, vcc_lo |
| ; GFX11FAKE16-NEXT: v_cmp_u_f64_e32 vcc_lo, v[6:7], v[6:7] |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v8 |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v11, v17, vcc_lo |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) |
| ; GFX11FAKE16-NEXT: v_perm_b32 v1, v2, v4, 0x7060302 |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v9 |
| ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] |
| %mag.trunc = fptrunc <4 x double> %mag to <4 x bfloat> |
| %result = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> %mag.trunc, <4 x bfloat> %sign) |
| ret <4 x bfloat> %result |
| } |
| |
| ; Codegen test for copysign on <4 x bfloat> where the magnitude operand is |
| ; already <4 x bfloat> while the sign operand arrives as <4 x float> and is |
| ; narrowed with fptrunc before the llvm.copysign.v4bf16 call. |
| ; The per-target assertions below are autogenerated (see the NOTE on the |
| ; first line of this file), so regenerate them with the update script |
| ; instead of editing them by hand. |
| define <4 x bfloat> @v_copysign_out_v4bf16_mag_v4bf16_sign_v4f32(<4 x bfloat> %mag, <4 x float> %sign) { |
| ; GCN-LABEL: v_copysign_out_v4bf16_mag_v4bf16_sign_v4f32: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 |
| ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 |
| ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 |
| ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 |
| ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 |
| ; GCN-NEXT: v_bfe_u32 v3, v3, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GCN-NEXT: v_and_b32_e32 v7, 0x8000, v7 |
| ; GCN-NEXT: v_and_b32_e32 v6, 0x8000, v6 |
| ; GCN-NEXT: v_and_b32_e32 v5, 0x8000, v5 |
| ; GCN-NEXT: v_and_b32_e32 v4, 0x8000, v4 |
| ; GCN-NEXT: v_or_b32_e32 v3, v3, v7 |
| ; GCN-NEXT: v_or_b32_e32 v2, v2, v6 |
| ; GCN-NEXT: v_or_b32_e32 v1, v1, v5 |
| ; GCN-NEXT: v_or_b32_e32 v0, v0, v4 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_out_v4bf16_mag_v4bf16_sign_v4f32: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 |
| ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 |
| ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 |
| ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 |
| ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 |
| ; GFX7-NEXT: v_and_b32_e32 v7, 0x8000, v7 |
| ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v6, 0x8000, v6 |
| ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v5, 0x8000, v5 |
| ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GFX7-NEXT: v_and_b32_e32 v4, 0x8000, v4 |
| ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v3, v3, v7 |
| ; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 |
| ; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_out_v4bf16_mag_v4bf16_sign_v4f32: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1 |
| ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4 |
| ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7 |
| ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4 |
| ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 |
| ; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc |
| ; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1 |
| ; GFX8-NEXT: s_movk_i32 s4, 0x7fff |
| ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2 |
| ; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7 |
| ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v2 |
| ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 |
| ; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v6, vcc |
| ; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1 |
| ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5 |
| ; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6 |
| ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 |
| ; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc |
| ; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1 |
| ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3 |
| ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 |
| ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 |
| ; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
| ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 |
| ; GFX8-NEXT: v_alignbit_b32 v2, v3, v2, 16 |
| ; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v2 |
| ; GFX8-NEXT: v_alignbit_b32 v2, v5, v4, 16 |
| ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_out_v4bf16_mag_v4bf16_sign_v4f32: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: v_bfe_u32 v6, v4, 16, 1 |
| ; GFX9-NEXT: s_movk_i32 s4, 0x7fff |
| ; GFX9-NEXT: v_add3_u32 v6, v6, v4, s4 |
| ; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4 |
| ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc |
| ; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 |
| ; GFX9-NEXT: v_add3_u32 v6, v6, v5, s4 |
| ; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 |
| ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc |
| ; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 |
| ; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4 |
| ; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v2 |
| ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc |
| ; GFX9-NEXT: v_bfe_u32 v6, v3, 16, 1 |
| ; GFX9-NEXT: v_add3_u32 v6, v6, v3, s4 |
| ; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3 |
| ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 |
| ; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc |
| ; GFX9-NEXT: s_mov_b32 s4, 0x7060302 |
| ; GFX9-NEXT: v_perm_b32 v2, v3, v2, s4 |
| ; GFX9-NEXT: s_mov_b32 s5, 0x7fff7fff |
| ; GFX9-NEXT: v_bfi_b32 v0, s5, v0, v2 |
| ; GFX9-NEXT: v_perm_b32 v2, v5, v4, s4 |
| ; GFX9-NEXT: v_bfi_b32 v1, s5, v1, v2 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_out_v4bf16_mag_v4bf16_sign_v4f32: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1 |
| ; GFX10-NEXT: v_bfe_u32 v8, v2, 16, 1 |
| ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v4 |
| ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 |
| ; GFX10-NEXT: v_bfe_u32 v10, v3, 16, 1 |
| ; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff |
| ; GFX10-NEXT: v_add3_u32 v8, v8, v2, 0x7fff |
| ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1 |
| ; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v5 |
| ; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v9, vcc_lo |
| ; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2 |
| ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 |
| ; GFX10-NEXT: v_add3_u32 v9, v10, v3, 0x7fff |
| ; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v3 |
| ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff |
| ; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v6, vcc_lo |
| ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 |
| ; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc_lo |
| ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 |
| ; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x7060302 |
| ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v11, vcc_lo |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2 |
| ; GFX10-NEXT: v_perm_b32 v3, v5, v4, 0x7060302 |
| ; GFX10-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v3 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11TRUE16-LABEL: v_copysign_out_v4bf16_mag_v4bf16_sign_v4f32: |
| ; GFX11TRUE16: ; %bb.0: |
| ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1 |
| ; GFX11TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 |
| ; GFX11TRUE16-NEXT: v_bfe_u32 v9, v2, 16, 1 |
| ; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v4 |
| ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 |
| ; GFX11TRUE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff |
| ; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 |
| ; GFX11TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff |
| ; GFX11TRUE16-NEXT: v_bfe_u32 v11, v3, 16, 1 |
| ; GFX11TRUE16-NEXT: v_add3_u32 v9, v9, v2, 0x7fff |
| ; GFX11TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v2 |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v7, v10, vcc_lo |
| ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 |
| ; GFX11TRUE16-NEXT: v_add3_u32 v7, v11, v3, 0x7fff |
| ; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3 |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v9, v12, vcc_lo |
| ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc_lo |
| ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h |
| ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v10, vcc_lo |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v3 |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v4, v5 |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2 |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v3 |
| ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11FAKE16-LABEL: v_copysign_out_v4bf16_mag_v4bf16_sign_v4f32: |
| ; GFX11FAKE16: ; %bb.0: |
| ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11FAKE16-NEXT: v_bfe_u32 v6, v4, 16, 1 |
| ; GFX11FAKE16-NEXT: v_bfe_u32 v8, v2, 16, 1 |
| ; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v4 |
| ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 |
| ; GFX11FAKE16-NEXT: v_bfe_u32 v7, v5, 16, 1 |
| ; GFX11FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff |
| ; GFX11FAKE16-NEXT: v_bfe_u32 v10, v3, 16, 1 |
| ; GFX11FAKE16-NEXT: v_add3_u32 v8, v8, v2, 0x7fff |
| ; GFX11FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v5 |
| ; GFX11FAKE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v6, v9, vcc_lo |
| ; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 |
| ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 |
| ; GFX11FAKE16-NEXT: v_add3_u32 v9, v10, v3, 0x7fff |
| ; GFX11FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v3 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v8, v6, vcc_lo |
| ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc_lo |
| ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 |
| ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v11, vcc_lo |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11FAKE16-NEXT: v_perm_b32 v2, v3, v2, 0x7060302 |
| ; GFX11FAKE16-NEXT: v_perm_b32 v3, v5, v4, 0x7060302 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2 |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v3 |
| ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] |
| ; Narrow the wider sign vector to bfloat first. The truncated value feeds |
| ; only the sign operand of the copysign call below and has no other use. |
| %sign.trunc = fptrunc <4 x float> %sign to <4 x bfloat> |
| %out = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> %mag, <4 x bfloat> %sign.trunc) |
| ret <4 x bfloat> %out |
| } |
| |
| ; Test: copysign producing <4 x bfloat>, with a <4 x bfloat> magnitude and a |
| ; sign operand that arrives as <4 x double> and is narrowed with fptrunc. |
| ; The per-prefix check lines below are autogenerated (see the NOTE at the top |
| ; of this file); regenerate them with the update script rather than editing |
| ; them by hand. |
| ; In every expected sequence below the sign is picked up with mask/shift/ |
| ; permute/bfi operations only; no conversion instruction is expected. |
| ; NOTE(review): the GCN/GFX7 checks mask 0x80000000 on v5/v7/v9/v11, while the |
| ; GFX8 checks mask 0x8000 on v3/v7 and shift v5/v9 left by 16 - confirm that |
| ; the GFX8+ sequences really read the sign bit of each f64 element. |
| define <4 x bfloat> @v_copysign_out_v4bf16_mag_v4bf16_sign_v4f64(<4 x bfloat> %mag, <4 x double> %sign) { |
| ; GCN-LABEL: v_copysign_out_v4bf16_mag_v4bf16_sign_v4f64: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 |
| ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GCN-NEXT: v_and_b32_e32 v4, 0x80000000, v7 |
| ; GCN-NEXT: v_and_b32_e32 v6, 0x80000000, v11 |
| ; GCN-NEXT: v_and_b32_e32 v7, 0x80000000, v9 |
| ; GCN-NEXT: v_and_b32_e32 v5, 0x80000000, v5 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 |
| ; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 |
| ; GCN-NEXT: v_bfe_u32 v3, v3, 16, 15 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 |
| ; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15 |
| ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 |
| ; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GCN-NEXT: v_or_b32_e32 v1, v1, v4 |
| ; GCN-NEXT: v_or_b32_e32 v3, v3, v6 |
| ; GCN-NEXT: v_or_b32_e32 v2, v2, v7 |
| ; GCN-NEXT: v_or_b32_e32 v0, v0, v5 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: v_copysign_out_v4bf16_mag_v4bf16_sign_v4f64: |
| ; GFX7: ; %bb.0: |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 |
| ; GFX7-NEXT: v_and_b32_e32 v4, 0x80000000, v7 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 |
| ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 |
| ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 |
| ; GFX7-NEXT: v_and_b32_e32 v4, 0x80000000, v11 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 |
| ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 |
| ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 |
| ; GFX7-NEXT: v_and_b32_e32 v4, 0x80000000, v9 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 |
| ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15 |
| ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 |
| ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 |
| ; GFX7-NEXT: v_and_b32_e32 v4, 0x80000000, v5 |
| ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 |
| ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15 |
| ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 |
| ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: v_copysign_out_v4bf16_mag_v4bf16_sign_v4f64: |
| ; GFX8: ; %bb.0: |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5 |
| ; GFX8-NEXT: v_and_b32_e32 v3, 0x8000, v3 |
| ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 |
| ; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff |
| ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v2 |
| ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v9 |
| ; GFX8-NEXT: v_and_b32_e32 v3, 0x8000, v7 |
| ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 |
| ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2 |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: v_copysign_out_v4bf16_mag_v4bf16_sign_v4f64: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-NEXT: s_mov_b32 s4, 0x5040100 |
| ; GFX9-NEXT: v_perm_b32 v2, v5, v3, s4 |
| ; GFX9-NEXT: s_mov_b32 s5, 0x7fff7fff |
| ; GFX9-NEXT: v_bfi_b32 v0, s5, v0, v2 |
| ; GFX9-NEXT: v_perm_b32 v2, v9, v7, s4 |
| ; GFX9-NEXT: v_bfi_b32 v1, s5, v1, v2 |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX10-LABEL: v_copysign_out_v4bf16_mag_v4bf16_sign_v4f64: |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: v_perm_b32 v2, v5, v3, 0x5040100 |
| ; GFX10-NEXT: v_perm_b32 v3, v9, v7, 0x5040100 |
| ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2 |
| ; GFX10-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v3 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11TRUE16-LABEL: v_copysign_out_v4bf16_mag_v4bf16_sign_v4f64: |
| ; GFX11TRUE16: ; %bb.0: |
| ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, v5.l |
| ; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.h, v9.l |
| ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v3 |
| ; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v7 |
| ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX11FAKE16-LABEL: v_copysign_out_v4bf16_mag_v4bf16_sign_v4f64: |
| ; GFX11FAKE16: ; %bb.0: |
| ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11FAKE16-NEXT: v_perm_b32 v2, v5, v3, 0x5040100 |
| ; GFX11FAKE16-NEXT: v_perm_b32 v3, v9, v7, 0x5040100 |
| ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2 |
| ; GFX11FAKE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v3 |
| ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] |
| ; Narrow the sign operand to the magnitude's element type so that both |
| ; copysign arguments are <4 x bfloat>. |
| %sign.trunc = fptrunc <4 x double> %sign to <4 x bfloat> |
| %out = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> %mag, <4 x bfloat> %sign.trunc) |
| ret <4 x bfloat> %out |
| } |
| |
| ; Attribute group #0. NOTE(review): no reference to #0 is visible in the excerpt at hand - confirm it is still used. |
| attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } |