; blob: 5dabb8cc633cc72409b15b34c7bd941934e591d8 [file] [edit]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bf16,+avx512vl | FileCheck %s --check-prefixes=CHECK,BF16
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2 | FileCheck %s --check-prefixes=CHECK,AVX10_2
;
; Signed Integer to BFloat
;
; Scalar signed i32 -> bfloat conversion.
; Both RUN configurations share the same expectations: a vcvtsi2ss from %edi
; followed by a vcvtneps2bf16 on the result.
define bfloat @sitofp_i32_to_bf16(i32 %a) {
; CHECK-LABEL: sitofp_i32_to_bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vcvtsi2ss %edi, %xmm15, %xmm0
; CHECK-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; CHECK-NEXT: retq
%cvt = sitofp i32 %a to bfloat
ret bfloat %cvt
}
; Scalar signed i64 -> bfloat conversion.
; Both RUN configurations share the same expectations: a vcvtsi2ss from the
; 64-bit %rdi followed by a vcvtneps2bf16 on the result.
define bfloat @sitofp_i64_to_bf16(i64 %a) {
; CHECK-LABEL: sitofp_i64_to_bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vcvtsi2ss %rdi, %xmm15, %xmm0
; CHECK-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; CHECK-NEXT: retq
%cvt = sitofp i64 %a to bfloat
ret bfloat %cvt
}
; Vector signed <8 x i32> -> <8 x bfloat> conversion.
; Both RUN configurations expect a 256-bit vcvtdq2ps, then vcvtneps2bf16
; narrowing ymm0 into xmm0, and a vzeroupper before returning.
define <8 x bfloat> @sitofp_v8i32_to_v8bf16(<8 x i32> %a) {
; CHECK-LABEL: sitofp_v8i32_to_v8bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
; CHECK-NEXT: vcvtneps2bf16 %ymm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%cvt = sitofp <8 x i32> %a to <8 x bfloat>
ret <8 x bfloat> %cvt
}
; Vector signed <4 x i32> -> <4 x bfloat> conversion.
; Both RUN configurations expect the xmm0 argument to be treated as part of
; ymm0 (see the "kill" line) and the same 256-bit vcvtdq2ps / vcvtneps2bf16
; sequence as the 8-element case, followed by vzeroupper.
define <4 x bfloat> @sitofp_v4i32_to_v4bf16(<4 x i32> %a) {
; CHECK-LABEL: sitofp_v4i32_to_v4bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
; CHECK-NEXT: vcvtneps2bf16 %ymm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%cvt = sitofp <4 x i32> %a to <4 x bfloat>
ret <4 x bfloat> %cvt
}
; Vector signed <2 x i32> -> <2 x bfloat> conversion.
; Both RUN configurations expect the xmm0 argument to be treated as part of
; ymm0 (see the "kill" line) and the 256-bit vcvtdq2ps / vcvtneps2bf16
; sequence, followed by vzeroupper.
define <2 x bfloat> @sitofp_v2i32_to_v2bf16(<2 x i32> %a) {
; CHECK-LABEL: sitofp_v2i32_to_v2bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
; CHECK-NEXT: vcvtneps2bf16 %ymm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%cvt = sitofp <2 x i32> %a to <2 x bfloat>
ret <2 x bfloat> %cvt
}
; Vector signed <16 x i32> -> <16 x bfloat> conversion.
; Both RUN configurations expect a 512-bit vcvtdq2ps followed by
; vcvtneps2bf16 narrowing zmm0 into ymm0; no vzeroupper is expected here.
define <16 x bfloat> @sitofp_v16i32_to_v16bf16(<16 x i32> %a) {
; CHECK-LABEL: sitofp_v16i32_to_v16bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0
; CHECK-NEXT: vcvtneps2bf16 %zmm0, %ymm0
; CHECK-NEXT: retq
%cvt = sitofp <16 x i32> %a to <16 x bfloat>
ret <16 x bfloat> %cvt
}
; Vector signed <2 x i64> -> <2 x bfloat> conversion; the two RUN
; configurations diverge here.
;  * avx512bf16 run: each element is moved to a GPR (vpextrq / vmovq),
;    converted with scalar vcvtsi2ss + vcvtneps2bf16, and the results are
;    combined with vpblendw against zero and vpinsrw.
;  * avx10.2 run: a single vcvtqq2ps on zmm0 followed by vcvtneps2bf16.
define <2 x bfloat> @sitofp_v2i64_to_v2bf16(<2 x i64> %a) {
; BF16-LABEL: sitofp_v2i64_to_v2bf16:
; BF16: # %bb.0:
; BF16-NEXT: vpextrq $1, %xmm0, %rax
; BF16-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1
; BF16-NEXT: vcvtneps2bf16 %xmm1, %xmm1
; BF16-NEXT: vmovd %xmm1, %eax
; BF16-NEXT: vmovq %xmm0, %rcx
; BF16-NEXT: vcvtsi2ss %rcx, %xmm15, %xmm0
; BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; BF16-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BF16-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; BF16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
; BF16-NEXT: retq
;
; AVX10_2-LABEL: sitofp_v2i64_to_v2bf16:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX10_2-NEXT: vcvtqq2ps %zmm0, %ymm0
; AVX10_2-NEXT: vcvtneps2bf16 %ymm0, %xmm0
; AVX10_2-NEXT: vzeroupper
; AVX10_2-NEXT: retq
%cvt = sitofp <2 x i64> %a to <2 x bfloat>
ret <2 x bfloat> %cvt
}
; Vector signed <4 x i64> -> <4 x bfloat> conversion; the two RUN
; configurations diverge here.
;  * avx512bf16 run: the upper 128 bits are extracted with vextracti128, all
;    four elements are converted one at a time (vcvtsi2ss + vcvtneps2bf16),
;    and the results are reassembled with vpblendw / vpinsrw.
;  * avx10.2 run: a single vcvtqq2ps on zmm0 followed by vcvtneps2bf16.
define <4 x bfloat> @sitofp_v4i64_to_v4bf16(<4 x i64> %a) {
; BF16-LABEL: sitofp_v4i64_to_v4bf16:
; BF16: # %bb.0:
; BF16-NEXT: vextracti128 $1, %ymm0, %xmm1
; BF16-NEXT: vpextrq $1, %xmm1, %rax
; BF16-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2
; BF16-NEXT: vcvtneps2bf16 %xmm2, %xmm2
; BF16-NEXT: vmovd %xmm2, %eax
; BF16-NEXT: vmovq %xmm1, %rcx
; BF16-NEXT: vcvtsi2ss %rcx, %xmm15, %xmm1
; BF16-NEXT: vcvtneps2bf16 %xmm1, %xmm1
; BF16-NEXT: vmovd %xmm1, %ecx
; BF16-NEXT: vpextrq $1, %xmm0, %rdx
; BF16-NEXT: vcvtsi2ss %rdx, %xmm15, %xmm1
; BF16-NEXT: vcvtneps2bf16 %xmm1, %xmm1
; BF16-NEXT: vmovd %xmm1, %edx
; BF16-NEXT: vmovq %xmm0, %rsi
; BF16-NEXT: vcvtsi2ss %rsi, %xmm15, %xmm0
; BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; BF16-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BF16-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; BF16-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0
; BF16-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
; BF16-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
; BF16-NEXT: vzeroupper
; BF16-NEXT: retq
;
; AVX10_2-LABEL: sitofp_v4i64_to_v4bf16:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX10_2-NEXT: vcvtqq2ps %zmm0, %ymm0
; AVX10_2-NEXT: vcvtneps2bf16 %ymm0, %xmm0
; AVX10_2-NEXT: vzeroupper
; AVX10_2-NEXT: retq
%cvt = sitofp <4 x i64> %a to <4 x bfloat>
ret <4 x bfloat> %cvt
}
; Vector signed <8 x i64> -> <8 x bfloat> conversion; the two RUN
; configurations diverge here.
;  * avx512bf16 run: each 128-bit lane is extracted (vextracti32x4 /
;    vextracti128), all eight elements are converted one at a time
;    (vcvtsi2ss + vcvtneps2bf16), and the results are reassembled with
;    vpunpcklwd and a chain of vpinsrw.
;  * avx10.2 run: a single vcvtqq2ps on zmm0 followed by vcvtneps2bf16.
define <8 x bfloat> @sitofp_v8i64_to_v8bf16(<8 x i64> %a) {
; BF16-LABEL: sitofp_v8i64_to_v8bf16:
; BF16: # %bb.0:
; BF16-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; BF16-NEXT: vpextrq $1, %xmm1, %rax
; BF16-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2
; BF16-NEXT: vcvtneps2bf16 %xmm2, %xmm2
; BF16-NEXT: vmovd %xmm2, %eax
; BF16-NEXT: vmovq %xmm1, %rcx
; BF16-NEXT: vcvtsi2ss %rcx, %xmm15, %xmm1
; BF16-NEXT: vcvtneps2bf16 %xmm1, %xmm1
; BF16-NEXT: vmovd %xmm1, %ecx
; BF16-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; BF16-NEXT: vpextrq $1, %xmm1, %rdx
; BF16-NEXT: vcvtsi2ss %rdx, %xmm15, %xmm2
; BF16-NEXT: vcvtneps2bf16 %xmm2, %xmm2
; BF16-NEXT: vmovd %xmm2, %edx
; BF16-NEXT: vmovq %xmm1, %rsi
; BF16-NEXT: vcvtsi2ss %rsi, %xmm15, %xmm1
; BF16-NEXT: vcvtneps2bf16 %xmm1, %xmm1
; BF16-NEXT: vmovd %xmm1, %esi
; BF16-NEXT: vextracti128 $1, %ymm0, %xmm1
; BF16-NEXT: vpextrq $1, %xmm1, %rdi
; BF16-NEXT: vcvtsi2ss %rdi, %xmm15, %xmm2
; BF16-NEXT: vcvtneps2bf16 %xmm2, %xmm2
; BF16-NEXT: vmovd %xmm2, %edi
; BF16-NEXT: vmovq %xmm1, %r8
; BF16-NEXT: vcvtsi2ss %r8, %xmm15, %xmm1
; BF16-NEXT: vcvtneps2bf16 %xmm1, %xmm1
; BF16-NEXT: vmovd %xmm1, %r8d
; BF16-NEXT: vpextrq $1, %xmm0, %r9
; BF16-NEXT: vcvtsi2ss %r9, %xmm15, %xmm1
; BF16-NEXT: vcvtneps2bf16 %xmm1, %xmm1
; BF16-NEXT: vmovq %xmm0, %r9
; BF16-NEXT: vcvtsi2ss %r9, %xmm15, %xmm0
; BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; BF16-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; BF16-NEXT: vpinsrw $2, %r8d, %xmm0, %xmm0
; BF16-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0
; BF16-NEXT: vpinsrw $4, %esi, %xmm0, %xmm0
; BF16-NEXT: vpinsrw $5, %edx, %xmm0, %xmm0
; BF16-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
; BF16-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
; BF16-NEXT: vzeroupper
; BF16-NEXT: retq
;
; AVX10_2-LABEL: sitofp_v8i64_to_v8bf16:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: vcvtqq2ps %zmm0, %ymm0
; AVX10_2-NEXT: vcvtneps2bf16 %ymm0, %xmm0
; AVX10_2-NEXT: vzeroupper
; AVX10_2-NEXT: retq
%cvt = sitofp <8 x i64> %a to <8 x bfloat>
ret <8 x bfloat> %cvt
}
;
; Unsigned Integer to BFloat
;
; Scalar unsigned i32 -> bfloat conversion.
; Both RUN configurations share the same expectations: a vcvtusi2ss from %edi
; followed by a vcvtneps2bf16 on the result.
define bfloat @uitofp_i32_to_bf16(i32 %a) {
; CHECK-LABEL: uitofp_i32_to_bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vcvtusi2ss %edi, %xmm15, %xmm0
; CHECK-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; CHECK-NEXT: retq
%cvt = uitofp i32 %a to bfloat
ret bfloat %cvt
}
; Scalar unsigned i64 -> bfloat conversion.
; Both RUN configurations share the same expectations: a vcvtusi2ss from the
; 64-bit %rdi followed by a vcvtneps2bf16 on the result.
define bfloat @uitofp_i64_to_bf16(i64 %a) {
; CHECK-LABEL: uitofp_i64_to_bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vcvtusi2ss %rdi, %xmm15, %xmm0
; CHECK-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; CHECK-NEXT: retq
%cvt = uitofp i64 %a to bfloat
ret bfloat %cvt
}
; Vector unsigned <8 x i32> -> <8 x bfloat> conversion.
; Both RUN configurations expect a 256-bit vcvtudq2ps, then vcvtneps2bf16
; narrowing ymm0 into xmm0, and a vzeroupper before returning.
define <8 x bfloat> @uitofp_v8i32_to_v8bf16(<8 x i32> %a) {
; CHECK-LABEL: uitofp_v8i32_to_v8bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vcvtudq2ps %ymm0, %ymm0
; CHECK-NEXT: vcvtneps2bf16 %ymm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%cvt = uitofp <8 x i32> %a to <8 x bfloat>
ret <8 x bfloat> %cvt
}
; Vector unsigned <4 x i32> -> <4 x bfloat> conversion.
; Both RUN configurations expect the xmm0 argument to be treated as part of
; ymm0 (see the "kill" line) and the 256-bit vcvtudq2ps / vcvtneps2bf16
; sequence, followed by vzeroupper.
define <4 x bfloat> @uitofp_v4i32_to_v4bf16(<4 x i32> %a) {
; CHECK-LABEL: uitofp_v4i32_to_v4bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT: vcvtudq2ps %ymm0, %ymm0
; CHECK-NEXT: vcvtneps2bf16 %ymm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%cvt = uitofp <4 x i32> %a to <4 x bfloat>
ret <4 x bfloat> %cvt
}
; Vector unsigned <2 x i32> -> <2 x bfloat> conversion.
; Both RUN configurations expect the xmm0 argument to be treated as part of
; ymm0 (see the "kill" line) and the 256-bit vcvtudq2ps / vcvtneps2bf16
; sequence, followed by vzeroupper.
define <2 x bfloat> @uitofp_v2i32_to_v2bf16(<2 x i32> %a) {
; CHECK-LABEL: uitofp_v2i32_to_v2bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT: vcvtudq2ps %ymm0, %ymm0
; CHECK-NEXT: vcvtneps2bf16 %ymm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%cvt = uitofp <2 x i32> %a to <2 x bfloat>
ret <2 x bfloat> %cvt
}
; Vector unsigned <16 x i32> -> <16 x bfloat> conversion.
; Both RUN configurations expect a 512-bit vcvtudq2ps followed by
; vcvtneps2bf16 narrowing zmm0 into ymm0; no vzeroupper is expected here.
define <16 x bfloat> @uitofp_v16i32_to_v16bf16(<16 x i32> %a) {
; CHECK-LABEL: uitofp_v16i32_to_v16bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vcvtudq2ps %zmm0, %zmm0
; CHECK-NEXT: vcvtneps2bf16 %zmm0, %ymm0
; CHECK-NEXT: retq
%cvt = uitofp <16 x i32> %a to <16 x bfloat>
ret <16 x bfloat> %cvt
}
; Vector unsigned <2 x i64> -> <2 x bfloat> conversion; the two RUN
; configurations diverge here.
;  * avx512bf16 run: each element is moved to a GPR (vpextrq / vmovq),
;    converted with scalar vcvtusi2ss + vcvtneps2bf16, and the results are
;    combined with vpblendw against zero and vpinsrw.
;  * avx10.2 run: a single vcvtuqq2ps on zmm0 followed by vcvtneps2bf16.
define <2 x bfloat> @uitofp_v2i64_to_v2bf16(<2 x i64> %a) {
; BF16-LABEL: uitofp_v2i64_to_v2bf16:
; BF16: # %bb.0:
; BF16-NEXT: vpextrq $1, %xmm0, %rax
; BF16-NEXT: vcvtusi2ss %rax, %xmm15, %xmm1
; BF16-NEXT: vcvtneps2bf16 %xmm1, %xmm1
; BF16-NEXT: vmovd %xmm1, %eax
; BF16-NEXT: vmovq %xmm0, %rcx
; BF16-NEXT: vcvtusi2ss %rcx, %xmm15, %xmm0
; BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; BF16-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BF16-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; BF16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
; BF16-NEXT: retq
;
; AVX10_2-LABEL: uitofp_v2i64_to_v2bf16:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX10_2-NEXT: vcvtuqq2ps %zmm0, %ymm0
; AVX10_2-NEXT: vcvtneps2bf16 %ymm0, %xmm0
; AVX10_2-NEXT: vzeroupper
; AVX10_2-NEXT: retq
%cvt = uitofp <2 x i64> %a to <2 x bfloat>
ret <2 x bfloat> %cvt
}
; Vector unsigned <4 x i64> -> <4 x bfloat> conversion; the two RUN
; configurations diverge here.
;  * avx512bf16 run: the upper 128 bits are extracted with vextracti128, all
;    four elements are converted one at a time (vcvtusi2ss + vcvtneps2bf16),
;    and the results are reassembled with vpblendw / vpinsrw.
;  * avx10.2 run: a single vcvtuqq2ps on zmm0 followed by vcvtneps2bf16.
define <4 x bfloat> @uitofp_v4i64_to_v4bf16(<4 x i64> %a) {
; BF16-LABEL: uitofp_v4i64_to_v4bf16:
; BF16: # %bb.0:
; BF16-NEXT: vextracti128 $1, %ymm0, %xmm1
; BF16-NEXT: vpextrq $1, %xmm1, %rax
; BF16-NEXT: vcvtusi2ss %rax, %xmm15, %xmm2
; BF16-NEXT: vcvtneps2bf16 %xmm2, %xmm2
; BF16-NEXT: vmovd %xmm2, %eax
; BF16-NEXT: vmovq %xmm1, %rcx
; BF16-NEXT: vcvtusi2ss %rcx, %xmm15, %xmm1
; BF16-NEXT: vcvtneps2bf16 %xmm1, %xmm1
; BF16-NEXT: vmovd %xmm1, %ecx
; BF16-NEXT: vpextrq $1, %xmm0, %rdx
; BF16-NEXT: vcvtusi2ss %rdx, %xmm15, %xmm1
; BF16-NEXT: vcvtneps2bf16 %xmm1, %xmm1
; BF16-NEXT: vmovd %xmm1, %edx
; BF16-NEXT: vmovq %xmm0, %rsi
; BF16-NEXT: vcvtusi2ss %rsi, %xmm15, %xmm0
; BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; BF16-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BF16-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; BF16-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0
; BF16-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
; BF16-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
; BF16-NEXT: vzeroupper
; BF16-NEXT: retq
;
; AVX10_2-LABEL: uitofp_v4i64_to_v4bf16:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX10_2-NEXT: vcvtuqq2ps %zmm0, %ymm0
; AVX10_2-NEXT: vcvtneps2bf16 %ymm0, %xmm0
; AVX10_2-NEXT: vzeroupper
; AVX10_2-NEXT: retq
%cvt = uitofp <4 x i64> %a to <4 x bfloat>
ret <4 x bfloat> %cvt
}
; Vector unsigned <8 x i64> -> <8 x bfloat> conversion; the two RUN
; configurations diverge here.
;  * avx512bf16 run: each 128-bit lane is extracted (vextracti32x4 /
;    vextracti128), all eight elements are converted one at a time
;    (vcvtusi2ss + vcvtneps2bf16), and the results are reassembled with
;    vpunpcklwd and a chain of vpinsrw.
;  * avx10.2 run: a single vcvtuqq2ps on zmm0 followed by vcvtneps2bf16.
define <8 x bfloat> @uitofp_v8i64_to_v8bf16(<8 x i64> %a) {
; BF16-LABEL: uitofp_v8i64_to_v8bf16:
; BF16: # %bb.0:
; BF16-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; BF16-NEXT: vpextrq $1, %xmm1, %rax
; BF16-NEXT: vcvtusi2ss %rax, %xmm15, %xmm2
; BF16-NEXT: vcvtneps2bf16 %xmm2, %xmm2
; BF16-NEXT: vmovd %xmm2, %eax
; BF16-NEXT: vmovq %xmm1, %rcx
; BF16-NEXT: vcvtusi2ss %rcx, %xmm15, %xmm1
; BF16-NEXT: vcvtneps2bf16 %xmm1, %xmm1
; BF16-NEXT: vmovd %xmm1, %ecx
; BF16-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; BF16-NEXT: vpextrq $1, %xmm1, %rdx
; BF16-NEXT: vcvtusi2ss %rdx, %xmm15, %xmm2
; BF16-NEXT: vcvtneps2bf16 %xmm2, %xmm2
; BF16-NEXT: vmovd %xmm2, %edx
; BF16-NEXT: vmovq %xmm1, %rsi
; BF16-NEXT: vcvtusi2ss %rsi, %xmm15, %xmm1
; BF16-NEXT: vcvtneps2bf16 %xmm1, %xmm1
; BF16-NEXT: vmovd %xmm1, %esi
; BF16-NEXT: vextracti128 $1, %ymm0, %xmm1
; BF16-NEXT: vpextrq $1, %xmm1, %rdi
; BF16-NEXT: vcvtusi2ss %rdi, %xmm15, %xmm2
; BF16-NEXT: vcvtneps2bf16 %xmm2, %xmm2
; BF16-NEXT: vmovd %xmm2, %edi
; BF16-NEXT: vmovq %xmm1, %r8
; BF16-NEXT: vcvtusi2ss %r8, %xmm15, %xmm1
; BF16-NEXT: vcvtneps2bf16 %xmm1, %xmm1
; BF16-NEXT: vmovd %xmm1, %r8d
; BF16-NEXT: vpextrq $1, %xmm0, %r9
; BF16-NEXT: vcvtusi2ss %r9, %xmm15, %xmm1
; BF16-NEXT: vcvtneps2bf16 %xmm1, %xmm1
; BF16-NEXT: vmovq %xmm0, %r9
; BF16-NEXT: vcvtusi2ss %r9, %xmm15, %xmm0
; BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; BF16-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; BF16-NEXT: vpinsrw $2, %r8d, %xmm0, %xmm0
; BF16-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0
; BF16-NEXT: vpinsrw $4, %esi, %xmm0, %xmm0
; BF16-NEXT: vpinsrw $5, %edx, %xmm0, %xmm0
; BF16-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
; BF16-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
; BF16-NEXT: vzeroupper
; BF16-NEXT: retq
;
; AVX10_2-LABEL: uitofp_v8i64_to_v8bf16:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: vcvtuqq2ps %zmm0, %ymm0
; AVX10_2-NEXT: vcvtneps2bf16 %ymm0, %xmm0
; AVX10_2-NEXT: vzeroupper
; AVX10_2-NEXT: retq
%cvt = uitofp <8 x i64> %a to <8 x bfloat>
ret <8 x bfloat> %cvt
}
;
; BFloat to Signed Integer
;
; Scalar bfloat -> signed i32 conversion.
; Both RUN configurations move the low 16-bit word of xmm0 into %eax, shift
; it left by 16, move it back into xmm0 and use vcvttss2si. They differ only
; in the first instruction: vpextrw $0 (avx512bf16 run) vs. vmovw (avx10.2
; run).
define i32 @fptosi_bf16_to_i32(bfloat %a) {
; BF16-LABEL: fptosi_bf16_to_i32:
; BF16: # %bb.0:
; BF16-NEXT: vpextrw $0, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm0
; BF16-NEXT: vcvttss2si %xmm0, %eax
; BF16-NEXT: retq
;
; AVX10_2-LABEL: fptosi_bf16_to_i32:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: vmovw %xmm0, %eax
; AVX10_2-NEXT: shll $16, %eax
; AVX10_2-NEXT: vmovd %eax, %xmm0
; AVX10_2-NEXT: vcvttss2si %xmm0, %eax
; AVX10_2-NEXT: retq
%cvt = fptosi bfloat %a to i32
ret i32 %cvt
}
; Scalar bfloat -> signed i64 conversion.
; Same shape as the i32 case (word to %eax, shll $16, vmovd back), but the
; final vcvttss2si targets the 64-bit %rax. The runs differ only in the
; first instruction: vpextrw $0 (avx512bf16 run) vs. vmovw (avx10.2 run).
define i64 @fptosi_bf16_to_i64(bfloat %a) {
; BF16-LABEL: fptosi_bf16_to_i64:
; BF16: # %bb.0:
; BF16-NEXT: vpextrw $0, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm0
; BF16-NEXT: vcvttss2si %xmm0, %rax
; BF16-NEXT: retq
;
; AVX10_2-LABEL: fptosi_bf16_to_i64:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: vmovw %xmm0, %eax
; AVX10_2-NEXT: shll $16, %eax
; AVX10_2-NEXT: vmovd %eax, %xmm0
; AVX10_2-NEXT: vcvttss2si %xmm0, %rax
; AVX10_2-NEXT: retq
%cvt = fptosi bfloat %a to i64
ret i64 %cvt
}
; Vector <8 x bfloat> -> signed <8 x i32> conversion.
; Both RUN configurations expect the words to be zero-extended to dwords
; (vpmovzxwd), shifted left by 16 (vpslld), and converted with a 256-bit
; vcvttps2dq.
define <8 x i32> @fptosi_v8bf16_to_v8i32(<8 x bfloat> %a) {
; CHECK-LABEL: fptosi_v8bf16_to_v8i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT: vpslld $16, %ymm0, %ymm0
; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
; CHECK-NEXT: retq
%cvt = fptosi <8 x bfloat> %a to <8 x i32>
ret <8 x i32> %cvt
}
; Vector <4 x bfloat> -> signed <4 x i32> conversion.
; Both RUN configurations expect a zeroed xmm1 to be interleaved below each
; input word with vpunpcklwd (zero word first, input word second), followed
; by a 128-bit vcvttps2dq.
define <4 x i32> @fptosi_v4bf16_to_v4i32(<4 x bfloat> %a) {
; CHECK-LABEL: fptosi_v4bf16_to_v4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
; CHECK-NEXT: retq
%cvt = fptosi <4 x bfloat> %a to <4 x i32>
ret <4 x i32> %cvt
}
; Vector <2 x bfloat> -> signed <2 x i32> conversion.
; Both RUN configurations move each of the two words through %eax with a
; shll $16, merge the two results with vinsertps, and finish with a 128-bit
; vcvttps2dq. The avx10.2 run uses vmovw for element 0 and handles the
; elements in the opposite order from the avx512bf16 run.
define <2 x i32> @fptosi_v2bf16_to_v2i32(<2 x bfloat> %a) {
; BF16-LABEL: fptosi_v2bf16_to_v2i32:
; BF16: # %bb.0:
; BF16-NEXT: vpextrw $1, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm1
; BF16-NEXT: vmovd %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm0
; BF16-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; BF16-NEXT: vcvttps2dq %xmm0, %xmm0
; BF16-NEXT: retq
;
; AVX10_2-LABEL: fptosi_v2bf16_to_v2i32:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: vmovw %xmm0, %eax
; AVX10_2-NEXT: shll $16, %eax
; AVX10_2-NEXT: vmovd %eax, %xmm1
; AVX10_2-NEXT: vpextrw $1, %xmm0, %eax
; AVX10_2-NEXT: shll $16, %eax
; AVX10_2-NEXT: vmovd %eax, %xmm0
; AVX10_2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX10_2-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX10_2-NEXT: retq
%cvt = fptosi <2 x bfloat> %a to <2 x i32>
ret <2 x i32> %cvt
}
; Vector <16 x bfloat> -> signed <16 x i32> conversion.
; Both RUN configurations expect the words to be zero-extended to dwords
; (vpmovzxwd into zmm0), shifted left by 16 (vpslld), and converted with a
; 512-bit vcvttps2dq.
define <16 x i32> @fptosi_v16bf16_to_v16i32(<16 x bfloat> %a) {
; CHECK-LABEL: fptosi_v16bf16_to_v16i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; CHECK-NEXT: vpslld $16, %zmm0, %zmm0
; CHECK-NEXT: vcvttps2dq %zmm0, %zmm0
; CHECK-NEXT: retq
%cvt = fptosi <16 x bfloat> %a to <16 x i32>
ret <16 x i32> %cvt
}
; Vector <2 x bfloat> -> signed <2 x i64> conversion; the two RUN
; configurations diverge here.
;  * avx512bf16 run: each word goes through %eax with shll $16, is converted
;    with scalar vcvttss2si into %rax, and the two results are joined with
;    vpunpcklqdq.
;  * avx10.2 run: zero words are interleaved with vpunpcklwd, then a single
;    vcvttps2qq (ymm0 -> zmm0) is used and only xmm0 of the result is kept.
define <2 x i64> @fptosi_v2bf16_to_v2i64(<2 x bfloat> %a) {
; BF16-LABEL: fptosi_v2bf16_to_v2i64:
; BF16: # %bb.0:
; BF16-NEXT: vpextrw $1, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm1
; BF16-NEXT: vcvttss2si %xmm1, %rax
; BF16-NEXT: vmovq %rax, %xmm1
; BF16-NEXT: vmovd %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm0
; BF16-NEXT: vcvttss2si %xmm0, %rax
; BF16-NEXT: vmovq %rax, %xmm0
; BF16-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; BF16-NEXT: retq
;
; AVX10_2-LABEL: fptosi_v2bf16_to_v2i64:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX10_2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX10_2-NEXT: vcvttps2qq %ymm0, %zmm0
; AVX10_2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX10_2-NEXT: vzeroupper
; AVX10_2-NEXT: retq
%cvt = fptosi <2 x bfloat> %a to <2 x i64>
ret <2 x i64> %cvt
}
; Vector <4 x bfloat> -> signed <4 x i64> conversion; the two RUN
; configurations diverge here.
;  * avx512bf16 run: four scalar vcvttss2si conversions (one per word), with
;    the results paired by vpunpcklqdq and joined by vinserti128.
;  * avx10.2 run: zero words are interleaved with vpunpcklwd, then a single
;    vcvttps2qq (ymm0 -> zmm0) is used and only ymm0 of the result is kept.
define <4 x i64> @fptosi_v4bf16_to_v4i64(<4 x bfloat> %a) {
; BF16-LABEL: fptosi_v4bf16_to_v4i64:
; BF16: # %bb.0:
; BF16-NEXT: vpextrw $3, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm1
; BF16-NEXT: vcvttss2si %xmm1, %rax
; BF16-NEXT: vmovq %rax, %xmm1
; BF16-NEXT: vpextrw $2, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm2
; BF16-NEXT: vcvttss2si %xmm2, %rax
; BF16-NEXT: vmovq %rax, %xmm2
; BF16-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; BF16-NEXT: vpextrw $1, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm2
; BF16-NEXT: vcvttss2si %xmm2, %rax
; BF16-NEXT: vmovq %rax, %xmm2
; BF16-NEXT: vmovd %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm0
; BF16-NEXT: vcvttss2si %xmm0, %rax
; BF16-NEXT: vmovq %rax, %xmm0
; BF16-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; BF16-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; BF16-NEXT: retq
;
; AVX10_2-LABEL: fptosi_v4bf16_to_v4i64:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX10_2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX10_2-NEXT: vcvttps2qq %ymm0, %zmm0
; AVX10_2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX10_2-NEXT: retq
%cvt = fptosi <4 x bfloat> %a to <4 x i64>
ret <4 x i64> %cvt
}
; Vector <8 x bfloat> -> signed <8 x i64> conversion; the two RUN
; configurations diverge here.
;  * avx512bf16 run: eight scalar vcvttss2si conversions (one per word), with
;    the results paired by vpunpcklqdq and joined by vinserti128 and
;    vinserti64x4.
;  * avx10.2 run: vpmovzxwd + vpslld $16 widen the input, then a single
;    vcvttps2qq (ymm0 -> zmm0) performs the conversion.
define <8 x i64> @fptosi_v8bf16_to_v8i64(<8 x bfloat> %a) {
; BF16-LABEL: fptosi_v8bf16_to_v8i64:
; BF16: # %bb.0:
; BF16-NEXT: vpextrw $7, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm1
; BF16-NEXT: vcvttss2si %xmm1, %rax
; BF16-NEXT: vmovq %rax, %xmm1
; BF16-NEXT: vpextrw $6, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm2
; BF16-NEXT: vcvttss2si %xmm2, %rax
; BF16-NEXT: vmovq %rax, %xmm2
; BF16-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; BF16-NEXT: vpextrw $5, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm2
; BF16-NEXT: vcvttss2si %xmm2, %rax
; BF16-NEXT: vmovq %rax, %xmm2
; BF16-NEXT: vpextrw $4, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm3
; BF16-NEXT: vcvttss2si %xmm3, %rax
; BF16-NEXT: vmovq %rax, %xmm3
; BF16-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; BF16-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; BF16-NEXT: vpextrw $3, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm2
; BF16-NEXT: vcvttss2si %xmm2, %rax
; BF16-NEXT: vmovq %rax, %xmm2
; BF16-NEXT: vpextrw $2, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm3
; BF16-NEXT: vcvttss2si %xmm3, %rax
; BF16-NEXT: vmovq %rax, %xmm3
; BF16-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; BF16-NEXT: vpextrw $1, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm3
; BF16-NEXT: vcvttss2si %xmm3, %rax
; BF16-NEXT: vmovq %rax, %xmm3
; BF16-NEXT: vmovd %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm0
; BF16-NEXT: vcvttss2si %xmm0, %rax
; BF16-NEXT: vmovq %rax, %xmm0
; BF16-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; BF16-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; BF16-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; BF16-NEXT: retq
;
; AVX10_2-LABEL: fptosi_v8bf16_to_v8i64:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX10_2-NEXT: vpslld $16, %ymm0, %ymm0
; AVX10_2-NEXT: vcvttps2qq %ymm0, %zmm0
; AVX10_2-NEXT: retq
%cvt = fptosi <8 x bfloat> %a to <8 x i64>
ret <8 x i64> %cvt
}
;
; BFloat to Unsigned Integer
;
; Scalar bfloat -> unsigned i32 conversion.
; Both RUN configurations move the low 16-bit word of xmm0 into %eax, shift
; it left by 16, move it back into xmm0 and use vcvttss2usi. They differ only
; in the first instruction: vpextrw $0 (avx512bf16 run) vs. vmovw (avx10.2
; run).
define i32 @fptoui_bf16_to_i32(bfloat %a) {
; BF16-LABEL: fptoui_bf16_to_i32:
; BF16: # %bb.0:
; BF16-NEXT: vpextrw $0, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm0
; BF16-NEXT: vcvttss2usi %xmm0, %eax
; BF16-NEXT: retq
;
; AVX10_2-LABEL: fptoui_bf16_to_i32:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: vmovw %xmm0, %eax
; AVX10_2-NEXT: shll $16, %eax
; AVX10_2-NEXT: vmovd %eax, %xmm0
; AVX10_2-NEXT: vcvttss2usi %xmm0, %eax
; AVX10_2-NEXT: retq
%cvt = fptoui bfloat %a to i32
ret i32 %cvt
}
; Scalar bfloat -> unsigned i64 conversion.
; Same shape as the i32 case (word to %eax, shll $16, vmovd back), but the
; final vcvttss2usi targets the 64-bit %rax. The runs differ only in the
; first instruction: vpextrw $0 (avx512bf16 run) vs. vmovw (avx10.2 run).
define i64 @fptoui_bf16_to_i64(bfloat %a) {
; BF16-LABEL: fptoui_bf16_to_i64:
; BF16: # %bb.0:
; BF16-NEXT: vpextrw $0, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm0
; BF16-NEXT: vcvttss2usi %xmm0, %rax
; BF16-NEXT: retq
;
; AVX10_2-LABEL: fptoui_bf16_to_i64:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: vmovw %xmm0, %eax
; AVX10_2-NEXT: shll $16, %eax
; AVX10_2-NEXT: vmovd %eax, %xmm0
; AVX10_2-NEXT: vcvttss2usi %xmm0, %rax
; AVX10_2-NEXT: retq
%cvt = fptoui bfloat %a to i64
ret i64 %cvt
}
; Vector <8 x bfloat> -> unsigned <8 x i32> conversion.
; Both RUN configurations expect the words to be zero-extended to dwords
; (vpmovzxwd), shifted left by 16 (vpslld), and converted with a 256-bit
; vcvttps2udq.
define <8 x i32> @fptoui_v8bf16_to_v8i32(<8 x bfloat> %a) {
; CHECK-LABEL: fptoui_v8bf16_to_v8i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT: vpslld $16, %ymm0, %ymm0
; CHECK-NEXT: vcvttps2udq %ymm0, %ymm0
; CHECK-NEXT: retq
%cvt = fptoui <8 x bfloat> %a to <8 x i32>
ret <8 x i32> %cvt
}
; Vector <4 x bfloat> -> unsigned <4 x i32> conversion.
; Both RUN configurations expect a zeroed xmm1 to be interleaved below each
; input word with vpunpcklwd (zero word first, input word second), followed
; by a 128-bit vcvttps2udq.
define <4 x i32> @fptoui_v4bf16_to_v4i32(<4 x bfloat> %a) {
; CHECK-LABEL: fptoui_v4bf16_to_v4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; CHECK-NEXT: vcvttps2udq %xmm0, %xmm0
; CHECK-NEXT: retq
%cvt = fptoui <4 x bfloat> %a to <4 x i32>
ret <4 x i32> %cvt
}
; Vector <2 x bfloat> -> unsigned <2 x i32> conversion.
; Both RUN configurations move each of the two words through %eax with a
; shll $16, merge the two results with vinsertps, and finish with a 128-bit
; vcvttps2udq. The avx10.2 run uses vmovw for element 0 and handles the
; elements in the opposite order from the avx512bf16 run.
define <2 x i32> @fptoui_v2bf16_to_v2i32(<2 x bfloat> %a) {
; BF16-LABEL: fptoui_v2bf16_to_v2i32:
; BF16: # %bb.0:
; BF16-NEXT: vpextrw $1, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm1
; BF16-NEXT: vmovd %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm0
; BF16-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; BF16-NEXT: vcvttps2udq %xmm0, %xmm0
; BF16-NEXT: retq
;
; AVX10_2-LABEL: fptoui_v2bf16_to_v2i32:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: vmovw %xmm0, %eax
; AVX10_2-NEXT: shll $16, %eax
; AVX10_2-NEXT: vmovd %eax, %xmm1
; AVX10_2-NEXT: vpextrw $1, %xmm0, %eax
; AVX10_2-NEXT: shll $16, %eax
; AVX10_2-NEXT: vmovd %eax, %xmm0
; AVX10_2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX10_2-NEXT: vcvttps2udq %xmm0, %xmm0
; AVX10_2-NEXT: retq
%cvt = fptoui <2 x bfloat> %a to <2 x i32>
ret <2 x i32> %cvt
}
; Vector <16 x bfloat> -> unsigned <16 x i32> conversion.
; Both RUN configurations expect the words to be zero-extended to dwords
; (vpmovzxwd into zmm0), shifted left by 16 (vpslld), and converted with a
; 512-bit vcvttps2udq.
define <16 x i32> @fptoui_v16bf16_to_v16i32(<16 x bfloat> %a) {
; CHECK-LABEL: fptoui_v16bf16_to_v16i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; CHECK-NEXT: vpslld $16, %zmm0, %zmm0
; CHECK-NEXT: vcvttps2udq %zmm0, %zmm0
; CHECK-NEXT: retq
%cvt = fptoui <16 x bfloat> %a to <16 x i32>
ret <16 x i32> %cvt
}
; Vector <2 x bfloat> -> unsigned <2 x i64> conversion; the two RUN
; configurations diverge here.
;  * avx512bf16 run: each word goes through %eax with shll $16, is converted
;    with scalar vcvttss2usi into %rax, and the two results are joined with
;    vpunpcklqdq.
;  * avx10.2 run: zero words are interleaved with vpunpcklwd, then a single
;    vcvttps2uqq (ymm0 -> zmm0) is used and only xmm0 of the result is kept.
define <2 x i64> @fptoui_v2bf16_to_v2i64(<2 x bfloat> %a) {
; BF16-LABEL: fptoui_v2bf16_to_v2i64:
; BF16: # %bb.0:
; BF16-NEXT: vpextrw $1, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm1
; BF16-NEXT: vcvttss2usi %xmm1, %rax
; BF16-NEXT: vmovq %rax, %xmm1
; BF16-NEXT: vmovd %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm0
; BF16-NEXT: vcvttss2usi %xmm0, %rax
; BF16-NEXT: vmovq %rax, %xmm0
; BF16-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; BF16-NEXT: retq
;
; AVX10_2-LABEL: fptoui_v2bf16_to_v2i64:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX10_2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX10_2-NEXT: vcvttps2uqq %ymm0, %zmm0
; AVX10_2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX10_2-NEXT: vzeroupper
; AVX10_2-NEXT: retq
%cvt = fptoui <2 x bfloat> %a to <2 x i64>
ret <2 x i64> %cvt
}
; Vector <4 x bfloat> -> unsigned <4 x i64> conversion; the two RUN
; configurations diverge here.
;  * avx512bf16 run: four scalar vcvttss2usi conversions (one per word), with
;    the results paired by vpunpcklqdq and joined by vinserti128.
;  * avx10.2 run: zero words are interleaved with vpunpcklwd, then a single
;    vcvttps2uqq (ymm0 -> zmm0) is used and only ymm0 of the result is kept.
define <4 x i64> @fptoui_v4bf16_to_v4i64(<4 x bfloat> %a) {
; BF16-LABEL: fptoui_v4bf16_to_v4i64:
; BF16: # %bb.0:
; BF16-NEXT: vpextrw $3, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm1
; BF16-NEXT: vcvttss2usi %xmm1, %rax
; BF16-NEXT: vmovq %rax, %xmm1
; BF16-NEXT: vpextrw $2, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm2
; BF16-NEXT: vcvttss2usi %xmm2, %rax
; BF16-NEXT: vmovq %rax, %xmm2
; BF16-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; BF16-NEXT: vpextrw $1, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm2
; BF16-NEXT: vcvttss2usi %xmm2, %rax
; BF16-NEXT: vmovq %rax, %xmm2
; BF16-NEXT: vmovd %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm0
; BF16-NEXT: vcvttss2usi %xmm0, %rax
; BF16-NEXT: vmovq %rax, %xmm0
; BF16-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; BF16-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; BF16-NEXT: retq
;
; AVX10_2-LABEL: fptoui_v4bf16_to_v4i64:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX10_2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX10_2-NEXT: vcvttps2uqq %ymm0, %zmm0
; AVX10_2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX10_2-NEXT: retq
%cvt = fptoui <4 x bfloat> %a to <4 x i64>
ret <4 x i64> %cvt
}
; Vector <8 x bfloat> -> unsigned <8 x i64> conversion; the two RUN
; configurations diverge here.
;  * avx512bf16 run: eight scalar vcvttss2usi conversions (one per word),
;    with the results paired by vpunpcklqdq and joined by vinserti128 and
;    vinserti64x4.
;  * avx10.2 run: vpmovzxwd + vpslld $16 widen the input, then a single
;    vcvttps2uqq (ymm0 -> zmm0) performs the conversion.
define <8 x i64> @fptoui_v8bf16_to_v8i64(<8 x bfloat> %a) {
; BF16-LABEL: fptoui_v8bf16_to_v8i64:
; BF16: # %bb.0:
; BF16-NEXT: vpextrw $7, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm1
; BF16-NEXT: vcvttss2usi %xmm1, %rax
; BF16-NEXT: vmovq %rax, %xmm1
; BF16-NEXT: vpextrw $6, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm2
; BF16-NEXT: vcvttss2usi %xmm2, %rax
; BF16-NEXT: vmovq %rax, %xmm2
; BF16-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; BF16-NEXT: vpextrw $5, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm2
; BF16-NEXT: vcvttss2usi %xmm2, %rax
; BF16-NEXT: vmovq %rax, %xmm2
; BF16-NEXT: vpextrw $4, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm3
; BF16-NEXT: vcvttss2usi %xmm3, %rax
; BF16-NEXT: vmovq %rax, %xmm3
; BF16-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; BF16-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; BF16-NEXT: vpextrw $3, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm2
; BF16-NEXT: vcvttss2usi %xmm2, %rax
; BF16-NEXT: vmovq %rax, %xmm2
; BF16-NEXT: vpextrw $2, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm3
; BF16-NEXT: vcvttss2usi %xmm3, %rax
; BF16-NEXT: vmovq %rax, %xmm3
; BF16-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; BF16-NEXT: vpextrw $1, %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm3
; BF16-NEXT: vcvttss2usi %xmm3, %rax
; BF16-NEXT: vmovq %rax, %xmm3
; BF16-NEXT: vmovd %xmm0, %eax
; BF16-NEXT: shll $16, %eax
; BF16-NEXT: vmovd %eax, %xmm0
; BF16-NEXT: vcvttss2usi %xmm0, %rax
; BF16-NEXT: vmovq %rax, %xmm0
; BF16-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; BF16-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; BF16-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; BF16-NEXT: retq
;
; AVX10_2-LABEL: fptoui_v8bf16_to_v8i64:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX10_2-NEXT: vpslld $16, %ymm0, %ymm0
; AVX10_2-NEXT: vcvttps2uqq %ymm0, %zmm0
; AVX10_2-NEXT: retq
%cvt = fptoui <8 x bfloat> %a to <8 x i64>
ret <8 x i64> %cvt
}