; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=CHECK,X64
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=CHECK,X86
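; These tests cover AVX512-FP16 data-movement lowering: vpbroadcastw broadcasts
; from memory, scalars and registers, vmovw/vmovsh scalar moves and half<->i16
; bitcasts, plain and masked vector loads/stores, element extracts, and
; build_vector lowering, on both 64-bit and 32-bit targets.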
define <8 x half> @broadcastph128(half* %x) {
; X64-LABEL: broadcastph128:
; X64: # %bb.0:
; X64-NEXT: vpbroadcastw (%rdi), %xmm0
; X64-NEXT: retq
;
; X86-LABEL: broadcastph128:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpbroadcastw (%eax), %xmm0
; X86-NEXT: retl
%l1 = load half, half* %x, align 2
%vec = insertelement <8 x half> undef, half %l1, i32 0
%res = shufflevector <8 x half> %vec, <8 x half> undef, <8 x i32> zeroinitializer
ret <8 x half> %res
}
define <16 x half> @broadcastph256(half* %x) {
; X64-LABEL: broadcastph256:
; X64: # %bb.0:
; X64-NEXT: vpbroadcastw (%rdi), %ymm0
; X64-NEXT: retq
;
; X86-LABEL: broadcastph256:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpbroadcastw (%eax), %ymm0
; X86-NEXT: retl
%l1 = load half, half* %x, align 2
%vec = insertelement <16 x half> undef, half %l1, i32 0
%res = shufflevector <16 x half> %vec, <16 x half> undef, <16 x i32> zeroinitializer
ret <16 x half> %res
}
define <32 x half> @broadcastph512(half* %x) {
; X64-LABEL: broadcastph512:
; X64: # %bb.0:
; X64-NEXT: vpbroadcastw (%rdi), %zmm0
; X64-NEXT: retq
;
; X86-LABEL: broadcastph512:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpbroadcastw (%eax), %zmm0
; X86-NEXT: retl
%l1 = load half, half* %x, align 2
%vec = insertelement <32 x half> undef, half %l1, i32 0
%res = shufflevector <32 x half> %vec, <32 x half> undef, <32 x i32> zeroinitializer
ret <32 x half> %res
}
define <8 x half> @broadcastph128_scalar(half %x) {
; X64-LABEL: broadcastph128_scalar:
; X64: # %bb.0:
; X64-NEXT: vpbroadcastw %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: broadcastph128_scalar:
; X86: # %bb.0:
; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: retl
%vec = insertelement <8 x half> undef, half %x, i32 0
%res = shufflevector <8 x half> %vec, <8 x half> undef, <8 x i32> zeroinitializer
ret <8 x half> %res
}
define <16 x half> @broadcastph256_scalar(half %x) {
; X64-LABEL: broadcastph256_scalar:
; X64: # %bb.0:
; X64-NEXT: vpbroadcastw %xmm0, %ymm0
; X64-NEXT: retq
;
; X86-LABEL: broadcastph256_scalar:
; X86: # %bb.0:
; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %ymm0
; X86-NEXT: retl
%vec = insertelement <16 x half> undef, half %x, i32 0
%res = shufflevector <16 x half> %vec, <16 x half> undef, <16 x i32> zeroinitializer
ret <16 x half> %res
}
define <32 x half> @broadcastph512_scalar(half %x) {
; X64-LABEL: broadcastph512_scalar:
; X64: # %bb.0:
; X64-NEXT: vpbroadcastw %xmm0, %zmm0
; X64-NEXT: retq
;
; X86-LABEL: broadcastph512_scalar:
; X86: # %bb.0:
; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %zmm0
; X86-NEXT: retl
%vec = insertelement <32 x half> undef, half %x, i32 0
%res = shufflevector <32 x half> %vec, <32 x half> undef, <32 x i32> zeroinitializer
ret <32 x half> %res
}
define <8 x half> @broadcastph128_reg(<8 x half> %x) {
; CHECK-LABEL: broadcastph128_reg:
; CHECK: # %bb.0:
; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <8 x half> %x, <8 x half> undef, <8 x i32> zeroinitializer
ret <8 x half> %res
}
define <16 x half> @broadcastph256_reg(<16 x half> %x) {
; CHECK-LABEL: broadcastph256_reg:
; CHECK: # %bb.0:
; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <16 x half> %x, <16 x half> undef, <16 x i32> zeroinitializer
ret <16 x half> %res
}
define <32 x half> @broadcastph512_reg(<32 x half> %x) {
; CHECK-LABEL: broadcastph512_reg:
; CHECK: # %bb.0:
; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <32 x half> %x, <32 x half> undef, <32 x i32> zeroinitializer
ret <32 x half> %res
}
define i16 @test1(half %x) {
; X64-LABEL: test1:
; X64: # %bb.0:
; X64-NEXT: vmovw %xmm0, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
;
; X86-LABEL: test1:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
%res = bitcast half %x to i16
ret i16 %res
}
define <8 x i16> @test2(i16 %x) {
; X64-LABEL: test2:
; X64: # %bb.0:
; X64-NEXT: vmovw %edi, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test2:
; X86: # %bb.0:
; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: retl
%res = insertelement <8 x i16> undef, i16 %x, i32 0
ret <8 x i16> %res
}
define <8 x i16> @test4(i16* %x) {
; X64-LABEL: test4:
; X64: # %bb.0:
; X64-NEXT: vpbroadcastw (%rdi), %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test4:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpbroadcastw (%eax), %xmm0
; X86-NEXT: retl
%y = load i16, i16* %x
%res = insertelement <8 x i16> undef, i16 %y, i32 0
ret <8 x i16> %res
}
define void @test5(half %x, half* %y) {
; X64-LABEL: test5:
; X64: # %bb.0:
; X64-NEXT: vmovsh %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: test5:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovsh %xmm0, (%eax)
; X86-NEXT: retl
store half %x, half* %y, align 2
ret void
}
define half @test7(i16* %x) {
; X64-LABEL: test7:
; X64: # %bb.0:
; X64-NEXT: vmovsh (%rdi), %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test7:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovsh (%eax), %xmm0
; X86-NEXT: retl
%y = load i16, i16* %x
%res = bitcast i16 %y to half
ret half %res
}
define <8 x i16> @test10(i16* %x) {
; X64-LABEL: test10:
; X64: # %bb.0:
; X64-NEXT: vmovw (%rdi), %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test10:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovw (%eax), %xmm0
; X86-NEXT: retl
%y = load i16, i16* %x, align 2
%res = insertelement <8 x i16> zeroinitializer, i16 %y, i32 0
ret <8 x i16> %res
}
define <16 x i16> @test10b(i16* %x) {
; X64-LABEL: test10b:
; X64: # %bb.0:
; X64-NEXT: vmovw (%rdi), %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test10b:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovw (%eax), %xmm0
; X86-NEXT: retl
%y = load i16, i16* %x, align 2
%res = insertelement <16 x i16> zeroinitializer, i16 %y, i32 0
ret <16 x i16> %res
}
define <32 x i16> @test10c(i16* %x) {
; X64-LABEL: test10c:
; X64: # %bb.0:
; X64-NEXT: vmovw (%rdi), %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test10c:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovw (%eax), %xmm0
; X86-NEXT: retl
%y = load i16, i16* %x, align 2
%res = insertelement <32 x i16> zeroinitializer, i16 %y, i32 0
ret <32 x i16> %res
}
define <8 x half> @test11(half* %x) {
; X64-LABEL: test11:
; X64: # %bb.0:
; X64-NEXT: vmovsh (%rdi), %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test11:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovsh (%eax), %xmm0
; X86-NEXT: retl
%y = load half, half* %x, align 2
%res = insertelement <8 x half> zeroinitializer, half %y, i32 0
ret <8 x half> %res
}
define <16 x half> @test11b(half* %x) {
; X64-LABEL: test11b:
; X64: # %bb.0:
; X64-NEXT: vmovsh (%rdi), %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test11b:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovsh (%eax), %xmm0
; X86-NEXT: retl
%y = load half, half* %x, align 2
%res = insertelement <16 x half> zeroinitializer, half %y, i32 0
ret <16 x half> %res
}
define <32 x half> @test11c(half* %x) {
; X64-LABEL: test11c:
; X64: # %bb.0:
; X64-NEXT: vmovsh (%rdi), %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test11c:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovsh (%eax), %xmm0
; X86-NEXT: retl
%y = load half, half* %x, align 2
%res = insertelement <32 x half> zeroinitializer, half %y, i32 0
ret <32 x half> %res
}
define <8 x half> @test14(half %x) {
; X64-LABEL: test14:
; X64: # %bb.0:
; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vmovsh %xmm0, %xmm1, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test14:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: retl
%res = insertelement <8 x half> zeroinitializer, half %x, i32 0
ret <8 x half> %res
}
define <16 x half> @test14b(half %x) {
; X64-LABEL: test14b:
; X64: # %bb.0:
; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vmovsh %xmm0, %xmm1, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test14b:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: retl
%res = insertelement <16 x half> zeroinitializer, half %x, i32 0
ret <16 x half> %res
}
define <32 x half> @test14c(half %x) {
; X64-LABEL: test14c:
; X64: # %bb.0:
; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vmovsh %xmm0, %xmm1, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test14c:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: retl
%res = insertelement <32 x half> zeroinitializer, half %x, i32 0
ret <32 x half> %res
}
define <8 x i16> @test15(i16 %x) {
; X64-LABEL: test15:
; X64: # %bb.0:
; X64-NEXT: vmovw %edi, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test15:
; X86: # %bb.0:
; X86-NEXT: vmovw {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: retl
%res = insertelement <8 x i16> zeroinitializer, i16 %x, i32 0
ret <8 x i16> %res
}
define <16 x i16> @test16(i16 %x) {
; X64-LABEL: test16:
; X64: # %bb.0:
; X64-NEXT: vmovw %edi, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test16:
; X86: # %bb.0:
; X86-NEXT: vmovw {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: retl
%res = insertelement <16 x i16> zeroinitializer, i16 %x, i32 0
ret <16 x i16> %res
}
define <32 x i16> @test17(i16 %x) {
; X64-LABEL: test17:
; X64: # %bb.0:
; X64-NEXT: vmovw %edi, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test17:
; X86: # %bb.0:
; X86-NEXT: vmovw {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: retl
%res = insertelement <32 x i16> zeroinitializer, i16 %x, i32 0
ret <32 x i16> %res
}
define <8 x i16> @test18(i16 %x) {
; X64-LABEL: test18:
; X64: # %bb.0:
; X64-NEXT: vmovw %edi, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test18:
; X86: # %bb.0:
; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: retl
%res = insertelement <8 x i16> undef, i16 %x, i32 0
ret <8 x i16> %res
}
define <16 x i16> @test19(i16 %x) {
; X64-LABEL: test19:
; X64: # %bb.0:
; X64-NEXT: vmovw %edi, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test19:
; X86: # %bb.0:
; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %ymm0
; X86-NEXT: retl
%res = insertelement <16 x i16> undef, i16 %x, i32 0
ret <16 x i16> %res
}
define <32 x i16> @test20(i16 %x) {
; X64-LABEL: test20:
; X64: # %bb.0:
; X64-NEXT: vmovw %edi, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test20:
; X86: # %bb.0:
; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %zmm0
; X86-NEXT: retl
%res = insertelement <32 x i16> undef, i16 %x, i32 0
ret <32 x i16> %res
}
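; Full, masked and zero-masked loads and stores of <32 x half>, <16 x half> and
; <8 x half>, both naturally aligned and under-aligned. The stores go through
; the external globals declared below.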
@g8f16 = external global <8 x half>
@g8f16u = external global <8 x half>, align 8
@g16f16 = external global <16 x half>
@g16f16u = external global <16 x half>, align 8
@g32f16 = external global <32 x half>
@g32f16u = external global <32 x half>, align 8
define <32 x half> @load32f16(<32 x half>* %a) {
; X64-LABEL: load32f16:
; X64: # %bb.0:
; X64-NEXT: vmovaps (%rdi), %zmm0
; X64-NEXT: retq
;
; X86-LABEL: load32f16:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovaps (%eax), %zmm0
; X86-NEXT: retl
%res = load <32 x half>, <32 x half>* %a
ret <32 x half> %res
}
define <32 x half> @load32f16mask(<32 x half>* %a, <32 x half> %b, i32 %c) {
; X64-LABEL: load32f16mask:
; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1
; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: load32f16mask:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1}
; X86-NEXT: retl
%msk = bitcast i32 %c to <32 x i1>
%res0 = load <32 x half>, <32 x half>* %a
%res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> %b
ret <32 x half> %res
}
define <32 x half> @load32f16maskz(<32 x half>* %a, i32 %c) {
; X64-LABEL: load32f16maskz:
; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1
; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: load32f16maskz:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} {z}
; X86-NEXT: retl
%msk = bitcast i32 %c to <32 x i1>
%res0 = load <32 x half>, <32 x half>* %a
%res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> zeroinitializer
ret <32 x half> %res
}
define <32 x half> @loadu32f16(<32 x half>* %a) {
; X64-LABEL: loadu32f16:
; X64: # %bb.0:
; X64-NEXT: vmovups (%rdi), %zmm0
; X64-NEXT: retq
;
; X86-LABEL: loadu32f16:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovups (%eax), %zmm0
; X86-NEXT: retl
%res = load <32 x half>, <32 x half>* %a, align 8
ret <32 x half> %res
}
define <32 x half> @loadu32f16mask(<32 x half>* %a, <32 x half> %b, i32 %c) {
; X64-LABEL: loadu32f16mask:
; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1
; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: loadu32f16mask:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1}
; X86-NEXT: retl
%msk = bitcast i32 %c to <32 x i1>
%res0 = load <32 x half>, <32 x half>* %a, align 8
%res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> %b
ret <32 x half> %res
}
define <32 x half> @loadu32f16maskz(<32 x half>* %a, i32 %c) {
; X64-LABEL: loadu32f16maskz:
; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1
; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: loadu32f16maskz:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} {z}
; X86-NEXT: retl
%msk = bitcast i32 %c to <32 x i1>
%res0 = load <32 x half>, <32 x half>* %a, align 8
%res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> zeroinitializer
ret <32 x half> %res
}
define void @store32f16(<32 x half> %a) {
; X64-LABEL: store32f16:
; X64: # %bb.0:
; X64-NEXT: movq g32f16@GOTPCREL(%rip), %rax
; X64-NEXT: vmovaps %zmm0, (%rax)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: store32f16:
; X86: # %bb.0:
; X86-NEXT: vmovaps %zmm0, g32f16
; X86-NEXT: vzeroupper
; X86-NEXT: retl
store <32 x half> %a, <32 x half>* @g32f16
ret void
}
define void @storeu32f16(<32 x half> %a) {
; X64-LABEL: storeu32f16:
; X64: # %bb.0:
; X64-NEXT: movq g32f16u@GOTPCREL(%rip), %rax
; X64-NEXT: vmovups %zmm0, (%rax)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: storeu32f16:
; X86: # %bb.0:
; X86-NEXT: vmovups %zmm0, g32f16u
; X86-NEXT: vzeroupper
; X86-NEXT: retl
store <32 x half> %a, <32 x half>* @g32f16u, align 8
ret void
}
declare void @llvm.masked.store.v32f16.p0v32f16(<32 x half>, <32 x half>*, i32, <32 x i1>)
declare <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half>*, i32, <32 x i1>, <32 x half>)
define void @storeu32f16mask(<32 x i1> %mask, <32 x half>* %addr, <32 x half> %val) {
; X64-LABEL: storeu32f16mask:
; X64: # %bb.0:
; X64-NEXT: vpsllw $7, %ymm0, %ymm0
; X64-NEXT: vpmovb2m %ymm0, %k1
; X64-NEXT: vmovdqu16 %zmm1, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: storeu32f16mask:
; X86: # %bb.0:
; X86-NEXT: vpsllw $7, %ymm0, %ymm0
; X86-NEXT: vpmovb2m %ymm0, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 %zmm1, (%eax) {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
call void @llvm.masked.store.v32f16.p0v32f16(<32 x half> %val, <32 x half>* %addr, i32 4, <32 x i1> %mask)
ret void
}
define <32 x half> @maskloadu32f16(<32 x half>* %addr, <32 x half> %val, <32 x i1> %mask) {
; X64-LABEL: maskloadu32f16:
; X64: # %bb.0:
; X64-NEXT: vpsllw $7, %ymm1, %ymm1
; X64-NEXT: vpmovb2m %ymm1, %k1
; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: maskloadu32f16:
; X86: # %bb.0:
; X86-NEXT: vpsllw $7, %ymm1, %ymm1
; X86-NEXT: vpmovb2m %ymm1, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1}
; X86-NEXT: retl
%res = call <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half>* %addr, i32 4, <32 x i1> %mask, <32 x half> %val)
ret <32 x half> %res
}
define <32 x half> @maskuloadu32f16(<32 x half>* %addr, <32 x i1> %mask) {
; X64-LABEL: maskuloadu32f16:
; X64: # %bb.0:
; X64-NEXT: vpsllw $7, %ymm0, %ymm0
; X64-NEXT: vpmovb2m %ymm0, %k1
; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: maskuloadu32f16:
; X86: # %bb.0:
; X86-NEXT: vpsllw $7, %ymm0, %ymm0
; X86-NEXT: vpmovb2m %ymm0, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} {z}
; X86-NEXT: retl
%res = call <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half>* %addr, i32 4, <32 x i1> %mask, <32 x half> undef)
ret <32 x half> %res
}
define <32 x half> @maskzloadu32f16(<32 x half>* %addr, <32 x i1> %mask) {
; X64-LABEL: maskzloadu32f16:
; X64: # %bb.0:
; X64-NEXT: vpsllw $7, %ymm0, %ymm0
; X64-NEXT: vpmovb2m %ymm0, %k1
; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: maskzloadu32f16:
; X86: # %bb.0:
; X86-NEXT: vpsllw $7, %ymm0, %ymm0
; X86-NEXT: vpmovb2m %ymm0, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} {z}
; X86-NEXT: retl
%res = call <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half>* %addr, i32 4, <32 x i1> %mask, <32 x half> zeroinitializer)
ret <32 x half> %res
}
define <32 x half> @movrr32f16(<32 x half> %a, <32 x half> %b) {
; CHECK-LABEL: movrr32f16:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
ret <32 x half> %b
}
define <32 x half> @movrrk32f16(<32 x half> %a, <32 x half> %b, i32 %msk) {
; X64-LABEL: movrrk32f16:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: movrrk32f16:
; X86: # %bb.0:
; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
%mask = bitcast i32 %msk to <32 x i1>
%res = select <32 x i1> %mask, <32 x half> %a, <32 x half> %b
ret <32 x half> %res
}
define <32 x half> @movrrkz32f16(<32 x half> %a, i32 %msk) {
; X64-LABEL: movrrkz32f16:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: movrrkz32f16:
; X86: # %bb.0:
; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
%mask = bitcast i32 %msk to <32 x i1>
%res = select <32 x i1> %mask, <32 x half> %a, <32 x half> zeroinitializer
ret <32 x half> %res
}
define <16 x half> @load16f16(<16 x half>* %a) {
; X64-LABEL: load16f16:
; X64: # %bb.0:
; X64-NEXT: vmovaps (%rdi), %ymm0
; X64-NEXT: retq
;
; X86-LABEL: load16f16:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovaps (%eax), %ymm0
; X86-NEXT: retl
%res = load <16 x half>, <16 x half>* %a
ret <16 x half> %res
}
define <16 x half> @load16f16mask(<16 x half>* %a, <16 x half> %b, i16 %c) {
; X64-LABEL: load16f16mask:
; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1
; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: load16f16mask:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1}
; X86-NEXT: retl
%msk = bitcast i16 %c to <16 x i1>
%res0 = load <16 x half>, <16 x half>* %a
%res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %b
ret <16 x half> %res
}
define <16 x half> @load16f16maskz(<16 x half>* %a, i16 %c) {
; X64-LABEL: load16f16maskz:
; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1
; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: load16f16maskz:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z}
; X86-NEXT: retl
%msk = bitcast i16 %c to <16 x i1>
%res0 = load <16 x half>, <16 x half>* %a
%res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer
ret <16 x half> %res
}
define <16 x half> @loadu16f16(<16 x half>* %a) {
; X64-LABEL: loadu16f16:
; X64: # %bb.0:
; X64-NEXT: vmovups (%rdi), %ymm0
; X64-NEXT: retq
;
; X86-LABEL: loadu16f16:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovups (%eax), %ymm0
; X86-NEXT: retl
%res = load <16 x half>, <16 x half>* %a, align 8
ret <16 x half> %res
}
define <16 x half> @loadu16f16mask(<16 x half>* %a, <16 x half> %b, i16 %c) {
; X64-LABEL: loadu16f16mask:
; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1
; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: loadu16f16mask:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1}
; X86-NEXT: retl
%msk = bitcast i16 %c to <16 x i1>
%res0 = load <16 x half>, <16 x half>* %a, align 8
%res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %b
ret <16 x half> %res
}
define <16 x half> @loadu16f16maskz(<16 x half>* %a, i16 %c) {
; X64-LABEL: loadu16f16maskz:
; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1
; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: loadu16f16maskz:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z}
; X86-NEXT: retl
%msk = bitcast i16 %c to <16 x i1>
%res0 = load <16 x half>, <16 x half>* %a, align 8
%res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer
ret <16 x half> %res
}
define void @store16f16(<16 x half> %a) {
; X64-LABEL: store16f16:
; X64: # %bb.0:
; X64-NEXT: movq g16f16@GOTPCREL(%rip), %rax
; X64-NEXT: vmovaps %ymm0, (%rax)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: store16f16:
; X86: # %bb.0:
; X86-NEXT: vmovaps %ymm0, g16f16
; X86-NEXT: vzeroupper
; X86-NEXT: retl
store <16 x half> %a, <16 x half>* @g16f16
ret void
}
define void @storeu16f16(<16 x half> %a) {
; X64-LABEL: storeu16f16:
; X64: # %bb.0:
; X64-NEXT: movq g16f16u@GOTPCREL(%rip), %rax
; X64-NEXT: vmovups %ymm0, (%rax)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: storeu16f16:
; X86: # %bb.0:
; X86-NEXT: vmovups %ymm0, g16f16u
; X86-NEXT: vzeroupper
; X86-NEXT: retl
store <16 x half> %a, <16 x half>* @g16f16u, align 8
ret void
}
declare void @llvm.masked.store.v16f16.p0v16f16(<16 x half>, <16 x half>*, i32, <16 x i1>)
declare <16 x half> @llvm.masked.load.v16f16.p0v16f16(<16 x half>*, i32, <16 x i1>, <16 x half>)
define void @storeu16f16mask(<16 x i1> %mask, <16 x half>* %addr, <16 x half> %val) {
; X64-LABEL: storeu16f16mask:
; X64: # %bb.0:
; X64-NEXT: vpsllw $7, %xmm0, %xmm0
; X64-NEXT: vpmovb2m %xmm0, %k1
; X64-NEXT: vmovdqu16 %ymm1, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: storeu16f16mask:
; X86: # %bb.0:
; X86-NEXT: vpsllw $7, %xmm0, %xmm0
; X86-NEXT: vpmovb2m %xmm0, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 %ymm1, (%eax) {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
call void @llvm.masked.store.v16f16.p0v16f16(<16 x half> %val, <16 x half>* %addr, i32 4, <16 x i1> %mask)
ret void
}
define <16 x half> @maskloadu16f16(<16 x half>* %addr, <16 x half> %val, <16 x i1> %mask) {
; X64-LABEL: maskloadu16f16:
; X64: # %bb.0:
; X64-NEXT: vpsllw $7, %xmm1, %xmm1
; X64-NEXT: vpmovb2m %xmm1, %k1
; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: maskloadu16f16:
; X86: # %bb.0:
; X86-NEXT: vpsllw $7, %xmm1, %xmm1
; X86-NEXT: vpmovb2m %xmm1, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1}
; X86-NEXT: retl
%res = call <16 x half> @llvm.masked.load.v16f16.p0v16f16(<16 x half>* %addr, i32 4, <16 x i1> %mask, <16 x half> %val)
ret <16 x half> %res
}
define <16 x half> @maskuloadu16f16(<16 x half>* %addr, <16 x i1> %mask) {
; X64-LABEL: maskuloadu16f16:
; X64: # %bb.0:
; X64-NEXT: vpsllw $7, %xmm0, %xmm0
; X64-NEXT: vpmovb2m %xmm0, %k1
; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: maskuloadu16f16:
; X86: # %bb.0:
; X86-NEXT: vpsllw $7, %xmm0, %xmm0
; X86-NEXT: vpmovb2m %xmm0, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z}
; X86-NEXT: retl
%res = call <16 x half> @llvm.masked.load.v16f16.p0v16f16(<16 x half>* %addr, i32 4, <16 x i1> %mask, <16 x half> undef)
ret <16 x half> %res
}
define <16 x half> @maskzloadu16f16(<16 x half>* %addr, <16 x i1> %mask) {
; X64-LABEL: maskzloadu16f16:
; X64: # %bb.0:
; X64-NEXT: vpsllw $7, %xmm0, %xmm0
; X64-NEXT: vpmovb2m %xmm0, %k1
; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: maskzloadu16f16:
; X86: # %bb.0:
; X86-NEXT: vpsllw $7, %xmm0, %xmm0
; X86-NEXT: vpmovb2m %xmm0, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z}
; X86-NEXT: retl
%res = call <16 x half> @llvm.masked.load.v16f16.p0v16f16(<16 x half>* %addr, i32 4, <16 x i1> %mask, <16 x half> zeroinitializer)
ret <16 x half> %res
}
define <16 x half> @movrr16f16(<16 x half> %a, <16 x half> %b) {
; CHECK-LABEL: movrr16f16:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
ret <16 x half> %b
}
define <16 x half> @movrrk16f16(<16 x half> %a, <16 x half> %b, i16 %msk) {
; X64-LABEL: movrrk16f16:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: movrrk16f16:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
%mask = bitcast i16 %msk to <16 x i1>
%res = select <16 x i1> %mask, <16 x half> %a, <16 x half> %b
ret <16 x half> %res
}
define <16 x half> @movrrkz16f16(<16 x half> %a, i16 %msk) {
; X64-LABEL: movrrkz16f16:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: movrrkz16f16:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
%mask = bitcast i16 %msk to <16 x i1>
%res = select <16 x i1> %mask, <16 x half> %a, <16 x half> zeroinitializer
ret <16 x half> %res
}
define <8 x half> @load8f16(<8 x half>* %a) {
; X64-LABEL: load8f16:
; X64: # %bb.0:
; X64-NEXT: vmovaps (%rdi), %xmm0
; X64-NEXT: retq
;
; X86-LABEL: load8f16:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovaps (%eax), %xmm0
; X86-NEXT: retl
%res = load <8 x half>, <8 x half>* %a
ret <8 x half> %res
}
define <8 x half> @load8f16mask(<8 x half>* %a, <8 x half> %b, i8 %c) {
; X64-LABEL: load8f16mask:
; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1
; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: load8f16mask:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1}
; X86-NEXT: retl
%msk = bitcast i8 %c to <8 x i1>
%res0 = load <8 x half>, <8 x half>* %a
%res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %b
ret <8 x half> %res
}
define <8 x half> @load8f16maskz(<8 x half>* %a, i8 %c) {
; X64-LABEL: load8f16maskz:
; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1
; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: load8f16maskz:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z}
; X86-NEXT: retl
%msk = bitcast i8 %c to <8 x i1>
%res0 = load <8 x half>, <8 x half>* %a
%res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer
ret <8 x half> %res
}
define <8 x half> @loadu8f16(<8 x half>* %a) {
; X64-LABEL: loadu8f16:
; X64: # %bb.0:
; X64-NEXT: vmovups (%rdi), %xmm0
; X64-NEXT: retq
;
; X86-LABEL: loadu8f16:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovups (%eax), %xmm0
; X86-NEXT: retl
%res = load <8 x half>, <8 x half>* %a, align 8
ret <8 x half> %res
}
define <8 x half> @loadu8f16mask(<8 x half>* %a, <8 x half> %b, i8 %c) {
; X64-LABEL: loadu8f16mask:
; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1
; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: loadu8f16mask:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1}
; X86-NEXT: retl
%msk = bitcast i8 %c to <8 x i1>
%res0 = load <8 x half>, <8 x half>* %a, align 8
%res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %b
ret <8 x half> %res
}
define <8 x half> @loadu8f16maskz(<8 x half>* %a, i8 %c) {
; X64-LABEL: loadu8f16maskz:
; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1
; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: loadu8f16maskz:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z}
; X86-NEXT: retl
%msk = bitcast i8 %c to <8 x i1>
%res0 = load <8 x half>, <8 x half>* %a, align 8
%res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer
ret <8 x half> %res
}
define void @store8f16(<8 x half> %a) {
; X64-LABEL: store8f16:
; X64: # %bb.0:
; X64-NEXT: movq g8f16@GOTPCREL(%rip), %rax
; X64-NEXT: vmovaps %xmm0, (%rax)
; X64-NEXT: retq
;
; X86-LABEL: store8f16:
; X86: # %bb.0:
; X86-NEXT: vmovaps %xmm0, g8f16
; X86-NEXT: retl
store <8 x half> %a, <8 x half>* @g8f16
ret void
}
define void @storeu8f16(<8 x half> %a) {
; X64-LABEL: storeu8f16:
; X64: # %bb.0:
; X64-NEXT: movq g8f16u@GOTPCREL(%rip), %rax
; X64-NEXT: vmovups %xmm0, (%rax)
; X64-NEXT: retq
;
; X86-LABEL: storeu8f16:
; X86: # %bb.0:
; X86-NEXT: vmovups %xmm0, g8f16u
; X86-NEXT: retl
store <8 x half> %a, <8 x half>* @g8f16u, align 8
ret void
}
declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32, <8 x i1>)
declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32, <8 x i1>, <8 x half>)
define void @storeu8f16mask(<8 x i1> %mask, <8 x half>* %addr, <8 x half> %val) {
; X64-LABEL: storeu8f16mask:
; X64: # %bb.0:
; X64-NEXT: vpsllw $15, %xmm0, %xmm0
; X64-NEXT: vpmovw2m %xmm0, %k1
; X64-NEXT: vmovdqu16 %xmm1, (%rdi) {%k1}
; X64-NEXT: retq
;
; X86-LABEL: storeu8f16mask:
; X86: # %bb.0:
; X86-NEXT: vpsllw $15, %xmm0, %xmm0
; X86-NEXT: vpmovw2m %xmm0, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 %xmm1, (%eax) {%k1}
; X86-NEXT: retl
call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %val, <8 x half>* %addr, i32 4, <8 x i1> %mask)
ret void
}
define <8 x half> @maskloadu8f16(<8 x half>* %addr, <8 x half> %val, <8 x i1> %mask) {
; X64-LABEL: maskloadu8f16:
; X64: # %bb.0:
; X64-NEXT: vpsllw $15, %xmm1, %xmm1
; X64-NEXT: vpmovw2m %xmm1, %k1
; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: maskloadu8f16:
; X86: # %bb.0:
; X86-NEXT: vpsllw $15, %xmm1, %xmm1
; X86-NEXT: vpmovw2m %xmm1, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1}
; X86-NEXT: retl
%res = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %addr, i32 4, <8 x i1> %mask, <8 x half> %val)
ret <8 x half> %res
}
define <8 x half> @maskuloadu8f16(<8 x half>* %addr, <8 x i1> %mask) {
; X64-LABEL: maskuloadu8f16:
; X64: # %bb.0:
; X64-NEXT: vpsllw $15, %xmm0, %xmm0
; X64-NEXT: vpmovw2m %xmm0, %k1
; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: maskuloadu8f16:
; X86: # %bb.0:
; X86-NEXT: vpsllw $15, %xmm0, %xmm0
; X86-NEXT: vpmovw2m %xmm0, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z}
; X86-NEXT: retl
%res = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %addr, i32 4, <8 x i1> %mask, <8 x half> undef)
ret <8 x half> %res
}
define <8 x half> @maskzloadu8f16(<8 x half>* %addr, <8 x i1> %mask) {
; X64-LABEL: maskzloadu8f16:
; X64: # %bb.0:
; X64-NEXT: vpsllw $15, %xmm0, %xmm0
; X64-NEXT: vpmovw2m %xmm0, %k1
; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: maskzloadu8f16:
; X86: # %bb.0:
; X86-NEXT: vpsllw $15, %xmm0, %xmm0
; X86-NEXT: vpmovw2m %xmm0, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z}
; X86-NEXT: retl
%res = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %addr, i32 4, <8 x i1> %mask, <8 x half> zeroinitializer)
ret <8 x half> %res
}
define <8 x half> @movrr8f16(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: movrr8f16:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
ret <8 x half> %b
}
define <8 x half> @movrrk8f16(<8 x half> %a, <8 x half> %b, i8 %msk) {
; X64-LABEL: movrrk8f16:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: movrrk8f16:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
%mask = bitcast i8 %msk to <8 x i1>
%res = select <8 x i1> %mask, <8 x half> %a, <8 x half> %b
ret <8 x half> %res
}
define <8 x half> @movrrkz8f16(<8 x half> %a, i8 %msk) {
; X64-LABEL: movrrkz8f16:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: movrrkz8f16:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
%mask = bitcast i8 %msk to <8 x i1>
%res = select <8 x i1> %mask, <8 x half> %a, <8 x half> zeroinitializer
ret <8 x half> %res
}
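; Element-0 blend via vmovsh and half<->i16 bitcasts via vmovw.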
define <8 x half> @movsh(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: movsh:
; CHECK: # %bb.0:
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11]
; CHECK-NEXT: vmovsh %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vaddph %xmm0, %xmm2, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%res1 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 7, i32 0, i32 1, i32 2, i32 3, i32 7, i32 5>
%res2 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%res = fadd <8 x half> %res1, %res2
ret <8 x half> %res
}
define i16 @test_movw(half %x) {
; X64-LABEL: test_movw:
; X64: # %bb.0:
; X64-NEXT: vmovw %xmm0, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
;
; X86-LABEL: test_movw:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
%res = bitcast half %x to i16
ret i16 %res
}
define half @test_movw2(i16 %x) {
; X64-LABEL: test_movw2:
; X64: # %bb.0:
; X64-NEXT: vmovw %edi, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test_movw2:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: retl
%res = bitcast i16 %x to half
ret half %res
}
; The sext avoids having a truncate in front of the bitcast input due to the
; calling convention or i16 op promotion.
define half @test_movw3(i8 %x) {
; X64-LABEL: test_movw3:
; X64: # %bb.0:
; X64-NEXT: movsbl %dil, %eax
; X64-NEXT: vmovw %eax, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test_movw3:
; X86: # %bb.0:
; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovw %eax, %xmm0
; X86-NEXT: retl
%z = sext i8 %x to i16
%a = bitcast i16 %z to half
ret half %a
}
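; Extracts of each half/i16 element: into a scalar, stored to memory, and
; zero-extended to i32.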
define half @extract_f16_0(<8 x half> %x) {
; CHECK-LABEL: extract_f16_0:
; CHECK: # %bb.0:
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x half> %x, i32 0
ret half %res
}
define half @extract_f16_1(<8 x half> %x) {
; CHECK-LABEL: extract_f16_1:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrld $16, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x half> %x, i32 1
ret half %res
}
define half @extract_f16_2(<8 x half> %x) {
; CHECK-LABEL: extract_f16_2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x half> %x, i32 2
ret half %res
}
define half @extract_f16_3(<8 x half> %x) {
; CHECK-LABEL: extract_f16_3:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlq $48, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x half> %x, i32 3
ret half %res
}
define half @extract_f16_4(<8 x half> %x) {
; CHECK-LABEL: extract_f16_4:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x half> %x, i32 4
ret half %res
}
define half @extract_f16_5(<8 x half> %x) {
; CHECK-LABEL: extract_f16_5:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x half> %x, i32 5
ret half %res
}
define half @extract_f16_6(<8 x half> %x) {
; CHECK-LABEL: extract_f16_6:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x half> %x, i32 6
ret half %res
}
define half @extract_f16_7(<8 x half> %x) {
; CHECK-LABEL: extract_f16_7:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x half> %x, i32 7
ret half %res
}
define i16 @extract_i16_0(<8 x i16> %x) {
; CHECK-LABEL: extract_i16_0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovw %xmm0, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x i16> %x, i32 0
ret i16 %res
}
define i16 @extract_i16_1(<8 x i16> %x) {
; CHECK-LABEL: extract_i16_1:
; CHECK: # %bb.0:
; CHECK-NEXT: vpextrw $1, %xmm0, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x i16> %x, i32 1
ret i16 %res
}
define i16 @extract_i16_2(<8 x i16> %x) {
; CHECK-LABEL: extract_i16_2:
; CHECK: # %bb.0:
; CHECK-NEXT: vpextrw $2, %xmm0, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x i16> %x, i32 2
ret i16 %res
}
define i16 @extract_i16_3(<8 x i16> %x) {
; CHECK-LABEL: extract_i16_3:
; CHECK: # %bb.0:
; CHECK-NEXT: vpextrw $3, %xmm0, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x i16> %x, i32 3
ret i16 %res
}
define i16 @extract_i16_4(<8 x i16> %x) {
; CHECK-LABEL: extract_i16_4:
; CHECK: # %bb.0:
; CHECK-NEXT: vpextrw $4, %xmm0, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x i16> %x, i32 4
ret i16 %res
}
define i16 @extract_i16_5(<8 x i16> %x) {
; CHECK-LABEL: extract_i16_5:
; CHECK: # %bb.0:
; CHECK-NEXT: vpextrw $5, %xmm0, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x i16> %x, i32 5
ret i16 %res
}
define i16 @extract_i16_6(<8 x i16> %x) {
; CHECK-LABEL: extract_i16_6:
; CHECK: # %bb.0:
; CHECK-NEXT: vpextrw $6, %xmm0, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x i16> %x, i32 6
ret i16 %res
}
define i16 @extract_i16_7(<8 x i16> %x) {
; CHECK-LABEL: extract_i16_7:
; CHECK: # %bb.0:
; CHECK-NEXT: vpextrw $7, %xmm0, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x i16> %x, i32 7
ret i16 %res
}
define void @extract_store_f16_0(<8 x half> %x, half* %y) {
; X64-LABEL: extract_store_f16_0:
; X64: # %bb.0:
; X64-NEXT: vmovsh %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_f16_0:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovsh %xmm0, (%eax)
; X86-NEXT: retl
%res = extractelement <8 x half> %x, i32 0
store half %res, half* %y
ret void
}
define void @extract_store_f16_1(<8 x half> %x, half* %y) {
; X64-LABEL: extract_store_f16_1:
; X64: # %bb.0:
; X64-NEXT: vpsrld $16, %xmm0, %xmm0
; X64-NEXT: vmovsh %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_f16_1:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpsrld $16, %xmm0, %xmm0
; X86-NEXT: vmovsh %xmm0, (%eax)
; X86-NEXT: retl
%res = extractelement <8 x half> %x, i32 1
store half %res, half* %y
ret void
}
define void @extract_store_f16_2(<8 x half> %x, half* %y) {
; X64-LABEL: extract_store_f16_2:
; X64: # %bb.0:
; X64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X64-NEXT: vmovsh %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_f16_2:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X86-NEXT: vmovsh %xmm0, (%eax)
; X86-NEXT: retl
%res = extractelement <8 x half> %x, i32 2
store half %res, half* %y
ret void
}
define void @extract_store_f16_3(<8 x half> %x, half* %y) {
; X64-LABEL: extract_store_f16_3:
; X64: # %bb.0:
; X64-NEXT: vpsrlq $48, %xmm0, %xmm0
; X64-NEXT: vmovsh %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_f16_3:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpsrlq $48, %xmm0, %xmm0
; X86-NEXT: vmovsh %xmm0, (%eax)
; X86-NEXT: retl
%res = extractelement <8 x half> %x, i32 3
store half %res, half* %y
ret void
}
define void @extract_store_f16_4(<8 x half> %x, half* %y) {
; X64-LABEL: extract_store_f16_4:
; X64: # %bb.0:
; X64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; X64-NEXT: vmovsh %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_f16_4:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; X86-NEXT: vmovsh %xmm0, (%eax)
; X86-NEXT: retl
%res = extractelement <8 x half> %x, i32 4
store half %res, half* %y
ret void
}
define void @extract_store_f16_5(<8 x half> %x, half* %y) {
; X64-LABEL: extract_store_f16_5:
; X64: # %bb.0:
; X64-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X64-NEXT: vmovsh %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_f16_5:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: vmovsh %xmm0, (%eax)
; X86-NEXT: retl
%res = extractelement <8 x half> %x, i32 5
store half %res, half* %y
ret void
}
define void @extract_store_f16_6(<8 x half> %x, half* %y) {
; X64-LABEL: extract_store_f16_6:
; X64: # %bb.0:
; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; X64-NEXT: vmovsh %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_f16_6:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; X86-NEXT: vmovsh %xmm0, (%eax)
; X86-NEXT: retl
%res = extractelement <8 x half> %x, i32 6
store half %res, half* %y
ret void
}
define void @extract_store_f16_7(<8 x half> %x, half* %y) {
; X64-LABEL: extract_store_f16_7:
; X64: # %bb.0:
; X64-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X64-NEXT: vmovsh %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_f16_7:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: vmovsh %xmm0, (%eax)
; X86-NEXT: retl
%res = extractelement <8 x half> %x, i32 7
store half %res, half* %y
ret void
}
define void @extract_store_i16_0(<8 x i16> %x, i16* %y) {
; X64-LABEL: extract_store_i16_0:
; X64: # %bb.0:
; X64-NEXT: vpextrw $0, %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_i16_0:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpextrw $0, %xmm0, (%eax)
; X86-NEXT: retl
%res = extractelement <8 x i16> %x, i32 0
store i16 %res, i16* %y
ret void
}
define void @extract_store_i16_1(<8 x i16> %x, i16* %y) {
; X64-LABEL: extract_store_i16_1:
; X64: # %bb.0:
; X64-NEXT: vpextrw $1, %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_i16_1:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpextrw $1, %xmm0, (%eax)
; X86-NEXT: retl
%res = extractelement <8 x i16> %x, i32 1
store i16 %res, i16* %y
ret void
}
define void @extract_store_i16_2(<8 x i16> %x, i16* %y) {
; X64-LABEL: extract_store_i16_2:
; X64: # %bb.0:
; X64-NEXT: vpextrw $2, %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_i16_2:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpextrw $2, %xmm0, (%eax)
; X86-NEXT: retl
%res = extractelement <8 x i16> %x, i32 2
store i16 %res, i16* %y
ret void
}
define void @extract_store_i16_3(<8 x i16> %x, i16* %y) {
; X64-LABEL: extract_store_i16_3:
; X64: # %bb.0:
; X64-NEXT: vpextrw $3, %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_i16_3:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpextrw $3, %xmm0, (%eax)
; X86-NEXT: retl
%res = extractelement <8 x i16> %x, i32 3
store i16 %res, i16* %y
ret void
}
define void @extract_store_i16_4(<8 x i16> %x, i16* %y) {
; X64-LABEL: extract_store_i16_4:
; X64: # %bb.0:
; X64-NEXT: vpextrw $4, %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_i16_4:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpextrw $4, %xmm0, (%eax)
; X86-NEXT: retl
%res = extractelement <8 x i16> %x, i32 4
store i16 %res, i16* %y
ret void
}
define void @extract_store_i16_5(<8 x i16> %x, i16* %y) {
; X64-LABEL: extract_store_i16_5:
; X64: # %bb.0:
; X64-NEXT: vpextrw $5, %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_i16_5:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpextrw $5, %xmm0, (%eax)
; X86-NEXT: retl
%res = extractelement <8 x i16> %x, i32 5
store i16 %res, i16* %y
ret void
}
define void @extract_store_i16_6(<8 x i16> %x, i16* %y) {
; X64-LABEL: extract_store_i16_6:
; X64: # %bb.0:
; X64-NEXT: vpextrw $6, %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_i16_6:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpextrw $6, %xmm0, (%eax)
; X86-NEXT: retl
%res = extractelement <8 x i16> %x, i32 6
store i16 %res, i16* %y
ret void
}
define void @extract_store_i16_7(<8 x i16> %x, i16* %y) {
; X64-LABEL: extract_store_i16_7:
; X64: # %bb.0:
; X64-NEXT: vpextrw $7, %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_i16_7:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpextrw $7, %xmm0, (%eax)
; X86-NEXT: retl
%res = extractelement <8 x i16> %x, i32 7
store i16 %res, i16* %y
ret void
}
define i32 @extract_zext_i16_0(<8 x i16> %x) {
; CHECK-LABEL: extract_zext_i16_0:
; CHECK: # %bb.0:
; CHECK-NEXT: vpextrw $0, %xmm0, %eax
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x i16> %x, i32 0
%res2 = zext i16 %res to i32
ret i32 %res2
}
define i32 @extract_zext_i16_1(<8 x i16> %x) {
; CHECK-LABEL: extract_zext_i16_1:
; CHECK: # %bb.0:
; CHECK-NEXT: vpextrw $1, %xmm0, %eax
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x i16> %x, i32 1
%res2 = zext i16 %res to i32
ret i32 %res2
}
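; build_vector lowering with various defined/undefined element patterns.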
define <8 x half> @build_vector_xxxxuuuu(half %a0, half %a1, half %a2, half %a3) {
; X64-LABEL: build_vector_xxxxuuuu:
; X64: # %bb.0:
; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero
; X64-NEXT: retq
;
; X86-LABEL: build_vector_xxxxuuuu:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2
; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; X86-NEXT: retl
%a = insertelement <8 x half> undef, half %a0, i32 0
%b = insertelement <8 x half> %a, half %a1, i32 1
%c = insertelement <8 x half> %b, half %a2, i32 2
%d = insertelement <8 x half> %c, half %a3, i32 3
ret <8 x half> %d
}
define <8 x half> @build_vector_uuuuxxxx(half %a0, half %a1, half %a2, half %a3) {
; X64-LABEL: build_vector_uuuuxxxx:
; X64: # %bb.0:
; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT: vpbroadcastq %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: build_vector_uuuuxxxx:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2
; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: vpbroadcastq %xmm0, %xmm0
; X86-NEXT: retl
%a = insertelement <8 x half> undef, half %a0, i32 4
%b = insertelement <8 x half> %a, half %a1, i32 5
%c = insertelement <8 x half> %b, half %a2, i32 6
%d = insertelement <8 x half> %c, half %a3, i32 7
ret <8 x half> %d
}
define <8 x half> @build_vector_xxxxxxxx(half %a0, half %a1, half %a2, half %a3, half %a4, half %a5, half %a6, half %a7) {
; X64-LABEL: build_vector_xxxxxxxx:
; X64: # %bb.0:
; X64-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; X64-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; X64-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; X64-NEXT: retq
;
; X86-LABEL: build_vector_xxxxxxxx:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2
; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2
; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm3
; X86-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X86-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X86-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X86-NEXT: retl
%a = insertelement <8 x half> undef, half %a0, i32 0
%b = insertelement <8 x half> %a, half %a1, i32 1
%c = insertelement <8 x half> %b, half %a2, i32 2
%d = insertelement <8 x half> %c, half %a3, i32 3
%e = insertelement <8 x half> %d, half %a4, i32 4
%f = insertelement <8 x half> %e, half %a5, i32 5
%g = insertelement <8 x half> %f, half %a6, i32 6
%h = insertelement <8 x half> %g, half %a7, i32 7
ret <8 x half> %h
}
define <16 x half> @build_vector_xxxxuuuuuuuuxxxx(half %a0, half %a1, half %a2, half %a3, half %a4, half %a5, half %a6, half %a7) {
; X64-LABEL: build_vector_xxxxuuuuuuuuxxxx:
; X64: # %bb.0:
; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero
; X64-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; X64-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-NEXT: vpbroadcastq %xmm1, %xmm1
; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-NEXT: retq
;
; X86-LABEL: build_vector_xxxxuuuuuuuuxxxx:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2
; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2
; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm3
; X86-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero
; X86-NEXT: vpbroadcastq %xmm0, %xmm0
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT: retl
%a = insertelement <16 x half> undef, half %a0, i32 0
%b = insertelement <16 x half> %a, half %a1, i32 1
%c = insertelement <16 x half> %b, half %a2, i32 2
%d = insertelement <16 x half> %c, half %a3, i32 3
%e = insertelement <16 x half> %d, half %a4, i32 12
%f = insertelement <16 x half> %e, half %a5, i32 13
%g = insertelement <16 x half> %f, half %a6, i32 14
%h = insertelement <16 x half> %g, half %a7, i32 15
ret <16 x half> %h
}
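; Regression tests and other miscellaneous patterns.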
define <8 x half> @regression1(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: regression1:
; CHECK: # %bb.0:
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 7, i32 0, i32 1, i32 2, i32 3, i32 7, i32 5>
ret <8 x half> %res
}
define <4 x float> @regression2(i8 addrspace(1)* %0, <4 x i32> %1, <4 x i32> %2, <4 x float> %3, i8* %4) {
; X64-LABEL: regression2:
; X64: # %bb.0:
; X64-NEXT: vmovw (%rsi), %xmm0
; X64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; X64-NEXT: vcvtdq2ps %xmm0, %xmm0
; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
; X64-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: regression2:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovw (%eax), %xmm0
; X86-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; X86-NEXT: vcvtdq2ps %xmm0, %xmm0
; X86-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
; X86-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0
; X86-NEXT: retl
%6 = getelementptr i8, i8* %4, i64 0
%7 = getelementptr i8, i8* %6, i64 0
%8 = getelementptr i8, i8* %7, i64 0
%9 = load i8, i8* %8, align 1
%10 = getelementptr i8, i8* %8, i64 1
%11 = addrspacecast i8* %10 to i8 addrspace(4)*
%12 = load i8, i8 addrspace(4)* %11, align 1
%13 = insertelement <2 x i8> poison, i8 %9, i32 0
%14 = insertelement <2 x i8> %13, i8 %12, i32 1
%15 = uitofp <2 x i8> %14 to <2 x float>
%16 = shufflevector <2 x float> %15, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
%17 = shufflevector <4 x float> %16, <4 x float> <float poison, float poison, float 0.000000e+00, float 2.550000e+02>, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
%18 = fmul contract <4 x float> %17, <float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000>
ret <4 x float> %18
}
; Make sure loads/stores of v4f16 are handled well on 32-bit targets where
; the default widening legalization can't use i64.
define void @load_store_v4f16(<4 x half>* %x, <4 x half>* %y, <4 x half>* %z) {
; X64-LABEL: load_store_v4f16:
; X64: # %bb.0:
; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT: vaddph %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovlps %xmm0, (%rdx)
; X64-NEXT: retq
;
; X86-LABEL: load_store_v4f16:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT: vaddph %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovlps %xmm0, (%eax)
; X86-NEXT: retl
%a = load <4 x half>, <4 x half>* %x
%b = load <4 x half>, <4 x half>* %y
%c = fadd <4 x half> %a, %b
store <4 x half> %c, <4 x half>* %z
ret void
}
define <8 x half> @test21(half %a, half %b, half %c) nounwind {
; X64-LABEL: test21:
; X64: # %bb.0:
; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X64-NEXT: vmovsh %xmm2, %xmm3, %xmm2
; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT: vpbroadcastw %xmm1, %xmm1
; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; X64-NEXT: retq
;
; X86-LABEL: test21:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT: vpbroadcastw %xmm1, %xmm1
; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; X86-NEXT: retl
%1 = insertelement <8 x half> <half poison, half poison, half poison, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000>, half %a, i32 0
%2 = insertelement <8 x half> %1, half %b, i32 1
%3 = insertelement <8 x half> %2, half %c, i32 2
ret <8 x half> %3
}
define <16 x i16> @test22(i16* %mem) nounwind {
; X64-LABEL: test22:
; X64: # %bb.0:
; X64-NEXT: movzwl 0, %eax
; X64-NEXT: andw (%rdi), %ax
; X64-NEXT: vmovw %eax, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test22:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzwl 0, %ecx
; X86-NEXT: andw (%eax), %cx
; X86-NEXT: vmovw %ecx, %xmm0
; X86-NEXT: retl
%1 = load i16, i16* null, align 2
%2 = load i16, i16* %mem, align 2
%3 = and i16 %1, %2
%4 = insertelement <16 x i16> <i16 undef, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, i16 %3, i32 0
ret <16 x i16> %4
}
define void @pr52560(i8 %0, <2 x i16> %1, i8* %c) nounwind {
; X64-LABEL: pr52560:
; X64: # %bb.0: # %entry
; X64-NEXT: movsbl %dil, %eax
; X64-NEXT: vmovw %eax, %xmm1
; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X64-NEXT: vpcmpgtw %xmm2, %xmm1, %k1
; X64-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: vmovw %xmm0, %eax
; X64-NEXT: testw %ax, %ax
; X64-NEXT: je .LBB121_2
; X64-NEXT: # %bb.1: # %for.body.preheader
; X64-NEXT: movb $0, (%rsi)
; X64-NEXT: .LBB121_2: # %for.end
; X64-NEXT: retq
;
; X86-LABEL: pr52560:
; X86: # %bb.0: # %entry
; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovw %eax, %xmm1
; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X86-NEXT: vpcmpgtw %xmm2, %xmm1, %k1
; X86-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: vmovw %xmm0, %eax
; X86-NEXT: testw %ax, %ax
; X86-NEXT: je .LBB121_2
; X86-NEXT: # %bb.1: # %for.body.preheader
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb $0, (%eax)
; X86-NEXT: .LBB121_2: # %for.end
; X86-NEXT: retl
entry:
%conv = sext i8 %0 to i16
%2 = insertelement <2 x i16> <i16 poison, i16 0>, i16 %conv, i32 0
%3 = icmp sgt <2 x i16> %2, zeroinitializer
%4 = select <2 x i1> %3, <2 x i16> %1, <2 x i16> <i16 0, i16 poison>
%5 = extractelement <2 x i16> %4, i32 0
%tobool.not14 = icmp eq i16 %5, 0
br i1 %tobool.not14, label %for.end, label %for.body.preheader
for.body.preheader: ; preds = %entry
store i8 0, i8* %c, align 1
br label %for.end
for.end: ; preds = %for.body.preheader, %entry
ret void
}