| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=CHECK,X64 |
| ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=CHECK,X86 |
| |
| define <8 x half> @broadcastph128(half* %x) { |
| ; X64-LABEL: broadcastph128: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vpbroadcastw (%rdi), %xmm0 |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: broadcastph128: |
| ; X86: # %bb.0: |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: vpbroadcastw (%eax), %xmm0 |
| ; X86-NEXT: retl |
| %l1 = load half, half* %x, align 2 |
| %vec = insertelement <8 x half> undef, half %l1, i32 0 |
| %res = shufflevector <8 x half> %vec, <8 x half> undef, <8 x i32> zeroinitializer |
| ret <8 x half> %res |
| } |
| |
| define <16 x half> @broadcastph256(half* %x) { |
| ; X64-LABEL: broadcastph256: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vpbroadcastw (%rdi), %ymm0 |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: broadcastph256: |
| ; X86: # %bb.0: |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: vpbroadcastw (%eax), %ymm0 |
| ; X86-NEXT: retl |
| %l1 = load half, half* %x, align 2 |
| %vec = insertelement <16 x half> undef, half %l1, i32 0 |
| %res = shufflevector <16 x half> %vec, <16 x half> undef, <16 x i32> zeroinitializer |
| ret <16 x half> %res |
| } |
| |
| define <32 x half> @broadcastph512(half* %x) { |
| ; X64-LABEL: broadcastph512: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vpbroadcastw (%rdi), %zmm0 |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: broadcastph512: |
| ; X86: # %bb.0: |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: vpbroadcastw (%eax), %zmm0 |
| ; X86-NEXT: retl |
| %l1 = load half, half* %x, align 2 |
| %vec = insertelement <32 x half> undef, half %l1, i32 0 |
| %res = shufflevector <32 x half> %vec, <32 x half> undef, <32 x i32> zeroinitializer |
| ret <32 x half> %res |
| } |
| |
| define <8 x half> @broadcastph128_scalar(half %x) { |
| ; X64-LABEL: broadcastph128_scalar: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vpbroadcastw %xmm0, %xmm0 |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: broadcastph128_scalar: |
| ; X86: # %bb.0: |
| ; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %xmm0 |
| ; X86-NEXT: retl |
| %vec = insertelement <8 x half> undef, half %x, i32 0 |
| %res = shufflevector <8 x half> %vec, <8 x half> undef, <8 x i32> zeroinitializer |
| ret <8 x half> %res |
| } |
| |
| define <16 x half> @broadcastph256_scalar(half %x) { |
| ; X64-LABEL: broadcastph256_scalar: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vpbroadcastw %xmm0, %ymm0 |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: broadcastph256_scalar: |
| ; X86: # %bb.0: |
| ; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %ymm0 |
| ; X86-NEXT: retl |
| %vec = insertelement <16 x half> undef, half %x, i32 0 |
| %res = shufflevector <16 x half> %vec, <16 x half> undef, <16 x i32> zeroinitializer |
| ret <16 x half> %res |
| } |
| |
| define <32 x half> @broadcastph512_scalar(half %x) { |
| ; X64-LABEL: broadcastph512_scalar: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vpbroadcastw %xmm0, %zmm0 |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: broadcastph512_scalar: |
| ; X86: # %bb.0: |
| ; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %zmm0 |
| ; X86-NEXT: retl |
| %vec = insertelement <32 x half> undef, half %x, i32 0 |
| %res = shufflevector <32 x half> %vec, <32 x half> undef, <32 x i32> zeroinitializer |
| ret <32 x half> %res |
| } |
| |
| define <8 x half> @broadcastph128_reg(<8 x half> %x) { |
| ; CHECK-LABEL: broadcastph128_reg: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 |
| ; CHECK-NEXT: ret{{[l|q]}} |
| %res = shufflevector <8 x half> %x, <8 x half> undef, <8 x i32> zeroinitializer |
| ret <8 x half> %res |
| } |
| |
| define <16 x half> @broadcastph256_reg(<16 x half> %x) { |
| ; CHECK-LABEL: broadcastph256_reg: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 |
| ; CHECK-NEXT: ret{{[l|q]}} |
| %res = shufflevector <16 x half> %x, <16 x half> undef, <16 x i32> zeroinitializer |
| ret <16 x half> %res |
| } |
| |
| define <32 x half> @broadcastph512_reg(<32 x half> %x) { |
| ; CHECK-LABEL: broadcastph512_reg: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 |
| ; CHECK-NEXT: ret{{[l|q]}} |
| %res = shufflevector <32 x half> %x, <32 x half> undef, <32 x i32> zeroinitializer |
| ret <32 x half> %res |
| } |
| |
| define i16 @test1(half %x) { |
| ; X64-LABEL: test1: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vmovw %xmm0, %eax |
| ; X64-NEXT: # kill: def $ax killed $ax killed $eax |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: test1: |
| ; X86: # %bb.0: |
| ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: retl |
| %res = bitcast half %x to i16 |
| ret i16 %res |
| } |
| |
| define <8 x i16> @test2(i16 %x) { |
| ; X64-LABEL: test2: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vmovw %edi, %xmm0 |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: test2: |
| ; X86: # %bb.0: |
| ; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %xmm0 |
| ; X86-NEXT: retl |
| %res = insertelement <8 x i16>undef, i16 %x, i32 0 |
| ret <8 x i16>%res |
| } |
| |
| define <8 x i16> @test4(i16* %x) { |
| ; X64-LABEL: test4: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vpbroadcastw (%rdi), %xmm0 |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: test4: |
| ; X86: # %bb.0: |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: vpbroadcastw (%eax), %xmm0 |
| ; X86-NEXT: retl |
| %y = load i16, i16* %x |
| %res = insertelement <8 x i16>undef, i16 %y, i32 0 |
| ret <8 x i16>%res |
| } |
| |
| define void @test5(half %x, half* %y) { |
| ; X64-LABEL: test5: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vmovsh %xmm0, (%rdi) |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: test5: |
| ; X86: # %bb.0: |
| ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: vmovsh %xmm0, (%eax) |
| ; X86-NEXT: retl |
| store half %x, half* %y, align 2 |
| ret void |
| } |
| |
| define half @test7(i16* %x) { |
| ; X64-LABEL: test7: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vmovsh (%rdi), %xmm0 |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: test7: |
| ; X86: # %bb.0: |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: vmovsh (%eax), %xmm0 |
| ; X86-NEXT: retl |
| %y = load i16, i16* %x |
| %res = bitcast i16 %y to half |
| ret half %res |
| } |
| |
| define <8 x i16> @test10(i16* %x) { |
| ; X64-LABEL: test10: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vmovw (%rdi), %xmm0 |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: test10: |
| ; X86: # %bb.0: |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: vmovw (%eax), %xmm0 |
| ; X86-NEXT: retl |
| %y = load i16, i16* %x, align 2 |
| %res = insertelement <8 x i16>zeroinitializer, i16 %y, i32 0 |
| ret <8 x i16>%res |
| } |
| |
| define <16 x i16> @test10b(i16* %x) { |
| ; X64-LABEL: test10b: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vmovw (%rdi), %xmm0 |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: test10b: |
| ; X86: # %bb.0: |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: vmovw (%eax), %xmm0 |
| ; X86-NEXT: retl |
| %y = load i16, i16* %x, align 2 |
| %res = insertelement <16 x i16>zeroinitializer, i16 %y, i32 0 |
| ret <16 x i16>%res |
| } |
| |
| define <32 x i16> @test10c(i16* %x) { |
| ; X64-LABEL: test10c: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vmovw (%rdi), %xmm0 |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: test10c: |
| ; X86: # %bb.0: |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: vmovw (%eax), %xmm0 |
| ; X86-NEXT: retl |
| %y = load i16, i16* %x, align 2 |
| %res = insertelement <32 x i16>zeroinitializer, i16 %y, i32 0 |
| ret <32 x i16>%res |
| } |
| |
| define <8 x half> @test11(half* %x) { |
| ; X64-LABEL: test11: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vmovsh (%rdi), %xmm0 |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: test11: |
| ; X86: # %bb.0: |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: vmovsh (%eax), %xmm0 |
| ; X86-NEXT: retl |
| %y = load half, half* %x, align 2 |
| %res = insertelement <8 x half>zeroinitializer, half %y, i32 0 |
| ret <8 x half>%res |
| } |
| |
| define <16 x half> @test11b(half* %x) { |
| ; X64-LABEL: test11b: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vmovsh (%rdi), %xmm0 |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: test11b: |
| ; X86: # %bb.0: |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: vmovsh (%eax), %xmm0 |
| ; X86-NEXT: retl |
| %y = load half, half* %x, align 2 |
| %res = insertelement <16 x half>zeroinitializer, half %y, i32 0 |
| ret <16 x half>%res |
| } |
| |
| define <32 x half> @test11c(half* %x) { |
| ; X64-LABEL: test11c: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vmovsh (%rdi), %xmm0 |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: test11c: |
| ; X86: # %bb.0: |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: vmovsh (%eax), %xmm0 |
| ; X86-NEXT: retl |
| %y = load half, half* %x, align 2 |
| %res = insertelement <32 x half>zeroinitializer, half %y, i32 0 |
| ret <32 x half>%res |
| } |
| |
| define <8 x half> @test14(half %x) { |
| ; X64-LABEL: test14: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 |
| ; X64-NEXT: vmovsh %xmm0, %xmm1, %xmm0 |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: test14: |
| ; X86: # %bb.0: |
| ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 |
| ; X86-NEXT: retl |
| %res = insertelement <8 x half>zeroinitializer, half %x, i32 0 |
| ret <8 x half>%res |
| } |
| |
| define <16 x half> @test14b(half %x) { |
| ; X64-LABEL: test14b: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 |
| ; X64-NEXT: vmovsh %xmm0, %xmm1, %xmm0 |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: test14b: |
| ; X86: # %bb.0: |
| ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 |
| ; X86-NEXT: retl |
| %res = insertelement <16 x half>zeroinitializer, half %x, i32 0 |
| ret <16 x half>%res |
| } |
| |
| define <32 x half> @test14c(half %x) { |
| ; X64-LABEL: test14c: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 |
| ; X64-NEXT: vmovsh %xmm0, %xmm1, %xmm0 |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: test14c: |
| ; X86: # %bb.0: |
| ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 |
| ; X86-NEXT: retl |
| %res = insertelement <32 x half>zeroinitializer, half %x, i32 0 |
| ret <32 x half>%res |
| } |
| |
| define <8 x i16> @test15(i16 %x) { |
| ; X64-LABEL: test15: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vmovw %edi, %xmm0 |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: test15: |
| ; X86: # %bb.0: |
| ; X86-NEXT: vmovw {{[0-9]+}}(%esp), %xmm0 |
| ; X86-NEXT: retl |
| %res = insertelement <8 x i16>zeroinitializer, i16 %x, i32 0 |
| ret <8 x i16>%res |
| } |
| |
| define <16 x i16> @test16(i16 %x) { |
| ; X64-LABEL: test16: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vmovw %edi, %xmm0 |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: test16: |
| ; X86: # %bb.0: |
| ; X86-NEXT: vmovw {{[0-9]+}}(%esp), %xmm0 |
| ; X86-NEXT: retl |
| %res = insertelement <16 x i16>zeroinitializer, i16 %x, i32 0 |
| ret <16 x i16>%res |
| } |
| |
| define <32 x i16> @test17(i16 %x) { |
| ; X64-LABEL: test17: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vmovw %edi, %xmm0 |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: test17: |
| ; X86: # %bb.0: |
| ; X86-NEXT: vmovw {{[0-9]+}}(%esp), %xmm0 |
| ; X86-NEXT: retl |
| %res = insertelement <32 x i16>zeroinitializer, i16 %x, i32 0 |
| ret <32 x i16>%res |
| } |
| |
| define <8 x i16> @test18(i16 %x) { |
| ; X64-LABEL: test18: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vmovw %edi, %xmm0 |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: test18: |
| ; X86: # %bb.0: |
| ; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %xmm0 |
| ; X86-NEXT: retl |
| %res = insertelement <8 x i16> undef, i16 %x, i32 0 |
| ret <8 x i16>%res |
| } |
| |
| define <16 x i16> @test19(i16 %x) { |
| ; X64-LABEL: test19: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vmovw %edi, %xmm0 |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: test19: |
| ; X86: # %bb.0: |
| ; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %ymm0 |
| ; X86-NEXT: retl |
| %res = insertelement <16 x i16> undef, i16 %x, i32 0 |
| ret <16 x i16>%res |
| } |
| |
| define <32 x i16> @test20(i16 %x) { |
| ; X64-LABEL: test20: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vmovw %edi, %xmm0 |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: test20: |
| ; X86: # %bb.0: |
| ; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %zmm0 |
| ; X86-NEXT: retl |
| %res = insertelement <32 x i16> undef, i16 %x, i32 0 |
| ret <32 x i16>%res |
| } |
| |
| @g8f16 = external global <8 x half> |
| @g8f16u = external global <8 x half>, align 8 |
| @g16f16 = external global <16 x half> |
| @g16f16u = external global <16 x half>, align 8 |
| @g32f16 = external global <32 x half> |
| @g32f16u = external global <32 x half>, align 8 |
| |
| define <32 x half> @load32f16(<32 x half>* %a) { |
| ; X64-LABEL: load32f16: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vmovaps (%rdi), %zmm0 |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: load32f16: |
| ; X86: # %bb.0: |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: vmovaps (%eax), %zmm0 |
| ; X86-NEXT: retl |
| %res = load <32 x half>, <32 x half>* %a |
| ret <32 x half> %res |
| } |
| |
| define <32 x half> @load32f16mask(<32 x half>* %a, <32 x half> %b, i32 %c) { |
| ; X64-LABEL: load32f16mask: |
| ; X64: # %bb.0: |
| ; X64-NEXT: kmovd %esi, %k1 |
| ; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: load32f16mask: |
| ; X86: # %bb.0: |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 |
| ; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} |
| ; X86-NEXT: retl |
| %msk = bitcast i32 %c to <32 x i1> |
| %res0 = load <32 x half>, <32 x half>* %a |
| %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> %b |
| ret <32 x half> %res |
| } |
| |
| define <32 x half> @load32f16maskz(<32 x half>* %a, i32 %c) { |
| ; X64-LABEL: load32f16maskz: |
| ; X64: # %bb.0: |
| ; X64-NEXT: kmovd %esi, %k1 |
| ; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z} |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: load32f16maskz: |
| ; X86: # %bb.0: |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 |
| ; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} {z} |
| ; X86-NEXT: retl |
| %msk = bitcast i32 %c to <32 x i1> |
| %res0 = load <32 x half>, <32 x half>* %a |
| %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> zeroinitializer |
| ret <32 x half> %res |
| } |
| |
| define <32 x half> @loadu32f16(<32 x half>* %a) { |
| ; X64-LABEL: loadu32f16: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vmovups (%rdi), %zmm0 |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: loadu32f16: |
| ; X86: # %bb.0: |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: vmovups (%eax), %zmm0 |
| ; X86-NEXT: retl |
| %res = load <32 x half>, <32 x half>* %a, align 8 |
| ret <32 x half> %res |
| } |
| |
| define <32 x half> @loadu32f16mask(<32 x half>* %a, <32 x half> %b, i32 %c) { |
| ; X64-LABEL: loadu32f16mask: |
| ; X64: # %bb.0: |
| ; X64-NEXT: kmovd %esi, %k1 |
| ; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: loadu32f16mask: |
| ; X86: # %bb.0: |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 |
| ; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} |
| ; X86-NEXT: retl |
| %msk = bitcast i32 %c to <32 x i1> |
| %res0 = load <32 x half>, <32 x half>* %a, align 8 |
| %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> %b |
| ret <32 x half> %res |
| } |
| |
| define <32 x half> @loadu32f16maskz(<32 x half>* %a, i32 %c) { |
| ; X64-LABEL: loadu32f16maskz: |
| ; X64: # %bb.0: |
| ; X64-NEXT: kmovd %esi, %k1 |
| ; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z} |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: loadu32f16maskz: |
| ; X86: # %bb.0: |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 |
| ; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} {z} |
| ; X86-NEXT: retl |
| %msk = bitcast i32 %c to <32 x i1> |
| %res0 = load <32 x half>, <32 x half>* %a, align 8 |
| %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> zeroinitializer |
| ret <32 x half> %res |
| } |
| |
| define void @store32f16(<32 x half> %a) { |
| ; X64-LABEL: store32f16: |
| ; X64: # %bb.0: |
| ; X64-NEXT: movq g32f16@GOTPCREL(%rip), %rax |
| ; X64-NEXT: vmovaps %zmm0, (%rax) |
| ; X64-NEXT: vzeroupper |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: store32f16: |
| ; X86: # %bb.0: |
| ; X86-NEXT: vmovaps %zmm0, g32f16 |
| ; X86-NEXT: vzeroupper |
| ; X86-NEXT: retl |
| store <32 x half> %a, <32 x half>* @g32f16 |
| ret void |
| } |
| |
| define void @storeu32f16(<32 x half> %a) { |
| ; X64-LABEL: storeu32f16: |
| ; X64: # %bb.0: |
| ; X64-NEXT: movq g32f16u@GOTPCREL(%rip), %rax |
| ; X64-NEXT: vmovups %zmm0, (%rax) |
| ; X64-NEXT: vzeroupper |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: storeu32f16: |
| ; X86: # %bb.0: |
| ; X86-NEXT: vmovups %zmm0, g32f16u |
| ; X86-NEXT: vzeroupper |
| ; X86-NEXT: retl |
| store <32 x half> %a, <32 x half>* @g32f16u, align 8 |
| ret void |
| } |
| |
| declare void @llvm.masked.store.v32f16.p0v32f16(<32 x half>, <32 x half>*, i32, <32 x i1>) |
| declare <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half>*, i32, <32 x i1>, <32 x half>) |
| |
| define void @storeu32f16mask(<32 x i1> %mask, <32 x half>* %addr, <32 x half> %val) { |
| ; X64-LABEL: storeu32f16mask: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vpsllw $7, %ymm0, %ymm0 |
| ; X64-NEXT: vpmovb2m %ymm0, %k1 |
| ; X64-NEXT: vmovdqu16 %zmm1, (%rdi) {%k1} |
| ; X64-NEXT: vzeroupper |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: storeu32f16mask: |
| ; X86: # %bb.0: |
| ; X86-NEXT: vpsllw $7, %ymm0, %ymm0 |
| ; X86-NEXT: vpmovb2m %ymm0, %k1 |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: vmovdqu16 %zmm1, (%eax) {%k1} |
| ; X86-NEXT: vzeroupper |
| ; X86-NEXT: retl |
| call void @llvm.masked.store.v32f16.p0v32f16(<32 x half> %val, <32 x half>* %addr, i32 4, <32 x i1>%mask) |
| ret void |
| } |
| |
| define <32 x half> @maskloadu32f16(<32 x half>* %addr, <32 x half> %val, <32 x i1> %mask) { |
| ; X64-LABEL: maskloadu32f16: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vpsllw $7, %ymm1, %ymm1 |
| ; X64-NEXT: vpmovb2m %ymm1, %k1 |
| ; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: maskloadu32f16: |
| ; X86: # %bb.0: |
| ; X86-NEXT: vpsllw $7, %ymm1, %ymm1 |
| ; X86-NEXT: vpmovb2m %ymm1, %k1 |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} |
| ; X86-NEXT: retl |
| %res = call <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half>* %addr, i32 4, <32 x i1> %mask, <32 x half> %val) |
| ret <32 x half> %res |
| } |
| |
| define <32 x half> @maskuloadu32f16(<32 x half>* %addr, <32 x i1> %mask) { |
| ; X64-LABEL: maskuloadu32f16: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vpsllw $7, %ymm0, %ymm0 |
| ; X64-NEXT: vpmovb2m %ymm0, %k1 |
| ; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z} |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: maskuloadu32f16: |
| ; X86: # %bb.0: |
| ; X86-NEXT: vpsllw $7, %ymm0, %ymm0 |
| ; X86-NEXT: vpmovb2m %ymm0, %k1 |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} {z} |
| ; X86-NEXT: retl |
| %res = call <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half>* %addr, i32 4, <32 x i1> %mask, <32 x half> undef) |
| ret <32 x half> %res |
| } |
| |
| define <32 x half> @maskzloadu32f16(<32 x half>* %addr, <32 x i1> %mask) { |
| ; X64-LABEL: maskzloadu32f16: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vpsllw $7, %ymm0, %ymm0 |
| ; X64-NEXT: vpmovb2m %ymm0, %k1 |
| ; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z} |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: maskzloadu32f16: |
| ; X86: # %bb.0: |
| ; X86-NEXT: vpsllw $7, %ymm0, %ymm0 |
| ; X86-NEXT: vpmovb2m %ymm0, %k1 |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} {z} |
| ; X86-NEXT: retl |
| %res = call <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half>* %addr, i32 4, <32 x i1> %mask, <32 x half> zeroinitializer) |
| ret <32 x half> %res |
| } |
| |
| define <32 x half> @movrr32f16(<32 x half> %a, <32 x half> %b) { |
| ; CHECK-LABEL: movrr32f16: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %zmm1, %zmm0 |
| ; CHECK-NEXT: ret{{[l|q]}} |
| ret <32 x half> %b |
| } |
| |
| define <32 x half> @movrrk32f16(<32 x half> %a, <32 x half> %b, i32 %msk) { |
| ; X64-LABEL: movrrk32f16: |
| ; X64: # %bb.0: |
| ; X64-NEXT: kmovd %edi, %k1 |
| ; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: movrrk32f16: |
| ; X86: # %bb.0: |
| ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 |
| ; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} |
| ; X86-NEXT: retl |
| %mask = bitcast i32 %msk to <32 x i1> |
| %res = select <32 x i1> %mask, <32 x half> %a, <32 x half> %b |
| ret <32 x half> %res |
| } |
| |
| define <32 x half> @movrrkz32f16(<32 x half> %a, i32 %msk) { |
| ; X64-LABEL: movrrkz32f16: |
| ; X64: # %bb.0: |
| ; X64-NEXT: kmovd %edi, %k1 |
| ; X64-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: movrrkz32f16: |
| ; X86: # %bb.0: |
| ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 |
| ; X86-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} |
| ; X86-NEXT: retl |
| %mask = bitcast i32 %msk to <32 x i1> |
| %res = select <32 x i1> %mask, <32 x half> %a, <32 x half> zeroinitializer |
| ret <32 x half> %res |
| } |
| |
| define <16 x half> @load16f16(<16 x half>* %a) { |
| ; X64-LABEL: load16f16: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vmovaps (%rdi), %ymm0 |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: load16f16: |
| ; X86: # %bb.0: |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: vmovaps (%eax), %ymm0 |
| ; X86-NEXT: retl |
| %res = load <16 x half>, <16 x half>* %a |
| ret <16 x half> %res |
| } |
| |
| define <16 x half> @load16f16mask(<16 x half>* %a, <16 x half> %b, i16 %c) { |
| ; X64-LABEL: load16f16mask: |
| ; X64: # %bb.0: |
| ; X64-NEXT: kmovd %esi, %k1 |
| ; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: load16f16mask: |
| ; X86: # %bb.0: |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 |
| ; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} |
| ; X86-NEXT: retl |
| %msk = bitcast i16 %c to <16 x i1> |
| %res0 = load <16 x half>, <16 x half>* %a |
| %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %b |
| ret <16 x half> %res |
| } |
| |
| define <16 x half> @load16f16maskz(<16 x half>* %a, i16 %c) { |
| ; X64-LABEL: load16f16maskz: |
| ; X64: # %bb.0: |
| ; X64-NEXT: kmovd %esi, %k1 |
| ; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z} |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: load16f16maskz: |
| ; X86: # %bb.0: |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 |
| ; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z} |
| ; X86-NEXT: retl |
| %msk = bitcast i16 %c to <16 x i1> |
| %res0 = load <16 x half>, <16 x half>* %a |
| %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer |
| ret <16 x half> %res |
| } |
| |
| define <16 x half> @loadu16f16(<16 x half>* %a) { |
| ; X64-LABEL: loadu16f16: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vmovups (%rdi), %ymm0 |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: loadu16f16: |
| ; X86: # %bb.0: |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: vmovups (%eax), %ymm0 |
| ; X86-NEXT: retl |
| %res = load <16 x half>, <16 x half>* %a, align 8 |
| ret <16 x half> %res |
| } |
| |
| define <16 x half> @loadu16f16mask(<16 x half>* %a, <16 x half> %b, i16 %c) { |
| ; X64-LABEL: loadu16f16mask: |
| ; X64: # %bb.0: |
| ; X64-NEXT: kmovd %esi, %k1 |
| ; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: loadu16f16mask: |
| ; X86: # %bb.0: |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 |
| ; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} |
| ; X86-NEXT: retl |
| %msk = bitcast i16 %c to <16 x i1> |
| %res0 = load <16 x half>, <16 x half>* %a, align 8 |
| %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %b |
| ret <16 x half> %res |
| } |
| |
| define <16 x half> @loadu16f16maskz(<16 x half>* %a, i16 %c) { |
| ; X64-LABEL: loadu16f16maskz: |
| ; X64: # %bb.0: |
| ; X64-NEXT: kmovd %esi, %k1 |
| ; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z} |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: loadu16f16maskz: |
| ; X86: # %bb.0: |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 |
| ; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z} |
| ; X86-NEXT: retl |
| %msk = bitcast i16 %c to <16 x i1> |
| %res0 = load <16 x half>, <16 x half>* %a, align 8 |
| %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer |
| ret <16 x half> %res |
| } |
| |
| define void @store16f16(<16 x half> %a) { |
| ; X64-LABEL: store16f16: |
| ; X64: # %bb.0: |
| ; X64-NEXT: movq g16f16@GOTPCREL(%rip), %rax |
| ; X64-NEXT: vmovaps %ymm0, (%rax) |
| ; X64-NEXT: vzeroupper |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: store16f16: |
| ; X86: # %bb.0: |
| ; X86-NEXT: vmovaps %ymm0, g16f16 |
| ; X86-NEXT: vzeroupper |
| ; X86-NEXT: retl |
| store <16 x half> %a, <16 x half>* @g16f16 |
| ret void |
| } |
| |
| define void @storeu16f16(<16 x half> %a) { |
| ; X64-LABEL: storeu16f16: |
| ; X64: # %bb.0: |
| ; X64-NEXT: movq g16f16u@GOTPCREL(%rip), %rax |
| ; X64-NEXT: vmovups %ymm0, (%rax) |
| ; X64-NEXT: vzeroupper |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: storeu16f16: |
| ; X86: # %bb.0: |
| ; X86-NEXT: vmovups %ymm0, g16f16u |
| ; X86-NEXT: vzeroupper |
| ; X86-NEXT: retl |
| store <16 x half> %a, <16 x half>* @g16f16u, align 8 |
| ret void |
| } |
| |
| declare void @llvm.masked.store.v16f16.p0v16f16(<16 x half>, <16 x half>*, i32, <16 x i1>) |
| declare <16 x half> @llvm.masked.load.v16f16.p0v16f16(<16 x half>*, i32, <16 x i1>, <16 x half>) |
| |
| define void @storeu16f16mask(<16 x i1> %mask, <16 x half>* %addr, <16 x half> %val) { |
| ; X64-LABEL: storeu16f16mask: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vpsllw $7, %xmm0, %xmm0 |
| ; X64-NEXT: vpmovb2m %xmm0, %k1 |
| ; X64-NEXT: vmovdqu16 %ymm1, (%rdi) {%k1} |
| ; X64-NEXT: vzeroupper |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: storeu16f16mask: |
| ; X86: # %bb.0: |
| ; X86-NEXT: vpsllw $7, %xmm0, %xmm0 |
| ; X86-NEXT: vpmovb2m %xmm0, %k1 |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: vmovdqu16 %ymm1, (%eax) {%k1} |
| ; X86-NEXT: vzeroupper |
| ; X86-NEXT: retl |
| call void @llvm.masked.store.v16f16.p0v16f16(<16 x half> %val, <16 x half>* %addr, i32 4, <16 x i1>%mask) |
| ret void |
| } |
| |
| define <16 x half> @maskloadu16f16(<16 x half>* %addr, <16 x half> %val, <16 x i1> %mask) { |
| ; X64-LABEL: maskloadu16f16: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vpsllw $7, %xmm1, %xmm1 |
| ; X64-NEXT: vpmovb2m %xmm1, %k1 |
| ; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: maskloadu16f16: |
| ; X86: # %bb.0: |
| ; X86-NEXT: vpsllw $7, %xmm1, %xmm1 |
| ; X86-NEXT: vpmovb2m %xmm1, %k1 |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} |
| ; X86-NEXT: retl |
| %res = call <16 x half> @llvm.masked.load.v16f16.p0v16f16(<16 x half>* %addr, i32 4, <16 x i1> %mask, <16 x half> %val) |
| ret <16 x half> %res |
| } |
| |
| define <16 x half> @maskuloadu16f16(<16 x half>* %addr, <16 x i1> %mask) { |
| ; X64-LABEL: maskuloadu16f16: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vpsllw $7, %xmm0, %xmm0 |
| ; X64-NEXT: vpmovb2m %xmm0, %k1 |
| ; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z} |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: maskuloadu16f16: |
| ; X86: # %bb.0: |
| ; X86-NEXT: vpsllw $7, %xmm0, %xmm0 |
| ; X86-NEXT: vpmovb2m %xmm0, %k1 |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z} |
| ; X86-NEXT: retl |
| %res = call <16 x half> @llvm.masked.load.v16f16.p0v16f16(<16 x half>* %addr, i32 4, <16 x i1> %mask, <16 x half> undef) |
| ret <16 x half> %res |
| } |
| |
| define <16 x half> @maskzloadu16f16(<16 x half>* %addr, <16 x i1> %mask) { |
| ; X64-LABEL: maskzloadu16f16: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vpsllw $7, %xmm0, %xmm0 |
| ; X64-NEXT: vpmovb2m %xmm0, %k1 |
| ; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z} |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: maskzloadu16f16: |
| ; X86: # %bb.0: |
| ; X86-NEXT: vpsllw $7, %xmm0, %xmm0 |
| ; X86-NEXT: vpmovb2m %xmm0, %k1 |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z} |
| ; X86-NEXT: retl |
| %res = call <16 x half> @llvm.masked.load.v16f16.p0v16f16(<16 x half>* %addr, i32 4, <16 x i1> %mask, <16 x half> zeroinitializer) |
| ret <16 x half> %res |
| } |
| |
| define <16 x half> @movrr16f16(<16 x half> %a, <16 x half> %b) { |
| ; CHECK-LABEL: movrr16f16: |
| ; CHECK: # %bb.0: |
| ; CHECK-NEXT: vmovaps %ymm1, %ymm0 |
| ; CHECK-NEXT: ret{{[l|q]}} |
| ret <16 x half> %b |
| } |
| |
| define <16 x half> @movrrk16f16(<16 x half> %a, <16 x half> %b, i16 %msk) { |
| ; X64-LABEL: movrrk16f16: |
| ; X64: # %bb.0: |
| ; X64-NEXT: kmovd %edi, %k1 |
| ; X64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: movrrk16f16: |
| ; X86: # %bb.0: |
| ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 |
| ; X86-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} |
| ; X86-NEXT: retl |
| %mask = bitcast i16 %msk to <16 x i1> |
| %res = select <16 x i1> %mask, <16 x half> %a, <16 x half> %b |
| ret <16 x half> %res |
| } |
| |
| define <16 x half> @movrrkz16f16(<16 x half> %a, i16 %msk) { |
| ; X64-LABEL: movrrkz16f16: |
| ; X64: # %bb.0: |
| ; X64-NEXT: kmovd %edi, %k1 |
| ; X64-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z} |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: movrrkz16f16: |
| ; X86: # %bb.0: |
| ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 |
| ; X86-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z} |
| ; X86-NEXT: retl |
| %mask = bitcast i16 %msk to <16 x i1> |
| %res = select <16 x i1> %mask, <16 x half> %a, <16 x half> zeroinitializer |
| ret <16 x half> %res |
| } |
| |
| define <8 x half> @load8f16(<8 x half>* %a) { |
| ; X64-LABEL: load8f16: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vmovaps (%rdi), %xmm0 |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: load8f16: |
| ; X86: # %bb.0: |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: vmovaps (%eax), %xmm0 |
| ; X86-NEXT: retl |
| %res = load <8 x half>, <8 x half>* %a |
| ret <8 x half> %res |
| } |
| |
| define <8 x half> @load8f16mask(<8 x half>* %a, <8 x half> %b, i8 %c) { |
| ; X64-LABEL: load8f16mask: |
| ; X64: # %bb.0: |
| ; X64-NEXT: kmovd %esi, %k1 |
| ; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: load8f16mask: |
| ; X86: # %bb.0: |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 |
| ; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} |
| ; X86-NEXT: retl |
| %msk = bitcast i8 %c to <8 x i1> |
| %res0 = load <8 x half>, <8 x half>* %a |
| %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %b |
| ret <8 x half> %res |
| } |
| |
| define <8 x half> @load8f16maskz(<8 x half>* %a, i8 %c) { |
| ; X64-LABEL: load8f16maskz: |
| ; X64: # %bb.0: |
| ; X64-NEXT: kmovd %esi, %k1 |
| ; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z} |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: load8f16maskz: |
| ; X86: # %bb.0: |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 |
| ; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z} |
| ; X86-NEXT: retl |
| %msk = bitcast i8 %c to <8 x i1> |
| %res0 = load <8 x half>, <8 x half>* %a |
| %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer |
| ret <8 x half> %res |
| } |
| |
| define <8 x half> @loadu8f16(<8 x half>* %a) { |
| ; X64-LABEL: loadu8f16: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vmovups (%rdi), %xmm0 |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: loadu8f16: |
| ; X86: # %bb.0: |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: vmovups (%eax), %xmm0 |
| ; X86-NEXT: retl |
| %res = load <8 x half>, <8 x half>* %a, align 8 |
| ret <8 x half> %res |
| } |
| |
; Merge-masked under-aligned (align 8) v8f16 load; vmovdqu16 tolerates any
; alignment, so the lowering matches the aligned masked case.
define <8 x half> @loadu8f16mask(<8 x half>* %a, <8 x half> %b, i8 %c) {
; X64-LABEL: loadu8f16mask:
; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1
; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: loadu8f16mask:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1}
; X86-NEXT: retl
%msk = bitcast i8 %c to <8 x i1>
%res0 = load <8 x half>, <8 x half>* %a, align 8
%res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %b
ret <8 x half> %res
}
| |
; Zero-masked under-aligned (align 8) v8f16 load -> vmovdqu16 {%k1} {z}.
define <8 x half> @loadu8f16maskz(<8 x half>* %a, i8 %c) {
; X64-LABEL: loadu8f16maskz:
; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1
; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: loadu8f16maskz:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z}
; X86-NEXT: retl
%msk = bitcast i8 %c to <8 x i1>
%res0 = load <8 x half>, <8 x half>* %a, align 8
%res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer
ret <8 x half> %res
}
| |
; Naturally-aligned v8f16 store to the global @g8f16 -> vmovaps; the X64 run
; addresses the global through the GOT.
define void @store8f16(<8 x half> %a) {
; X64-LABEL: store8f16:
; X64: # %bb.0:
; X64-NEXT: movq g8f16@GOTPCREL(%rip), %rax
; X64-NEXT: vmovaps %xmm0, (%rax)
; X64-NEXT: retq
;
; X86-LABEL: store8f16:
; X86: # %bb.0:
; X86-NEXT: vmovaps %xmm0, g8f16
; X86-NEXT: retl
store <8 x half> %a, <8 x half>* @g8f16
ret void
}
| |
; Under-aligned (align 8) v8f16 store to the global @g8f16u must use the
; unaligned vmovups rather than vmovaps.
define void @storeu8f16(<8 x half> %a) {
; X64-LABEL: storeu8f16:
; X64: # %bb.0:
; X64-NEXT: movq g8f16u@GOTPCREL(%rip), %rax
; X64-NEXT: vmovups %xmm0, (%rax)
; X64-NEXT: retq
;
; X86-LABEL: storeu8f16:
; X86: # %bb.0:
; X86-NEXT: vmovups %xmm0, g8f16u
; X86-NEXT: retl
store <8 x half> %a, <8 x half>* @g8f16u, align 8
ret void
}
| |
| declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32, <8 x i1>) |
| declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32, <8 x i1>, <8 x half>) |
| |
; llvm.masked.store of v8f16 with a <8 x i1> vector mask: the i1 mask is
; materialized into %k1 via vpsllw $15 + vpmovw2m, then a masked vmovdqu16
; store is emitted.
define void @storeu8f16mask(<8 x i1> %mask, <8 x half>* %addr, <8 x half> %val) {
; X64-LABEL: storeu8f16mask:
; X64: # %bb.0:
; X64-NEXT: vpsllw $15, %xmm0, %xmm0
; X64-NEXT: vpmovw2m %xmm0, %k1
; X64-NEXT: vmovdqu16 %xmm1, (%rdi) {%k1}
; X64-NEXT: retq
;
; X86-LABEL: storeu8f16mask:
; X86: # %bb.0:
; X86-NEXT: vpsllw $15, %xmm0, %xmm0
; X86-NEXT: vpmovw2m %xmm0, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 %xmm1, (%eax) {%k1}
; X86-NEXT: retl
call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %val, <8 x half>* %addr, i32 4, <8 x i1>%mask)
ret void
}
| |
; llvm.masked.load of v8f16 with passthrough %val: merge-masked vmovdqu16
; after converting the <8 x i1> mask to %k1.
define <8 x half> @maskloadu8f16(<8 x half>* %addr, <8 x half> %val, <8 x i1> %mask) {
; X64-LABEL: maskloadu8f16:
; X64: # %bb.0:
; X64-NEXT: vpsllw $15, %xmm1, %xmm1
; X64-NEXT: vpmovw2m %xmm1, %k1
; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: maskloadu8f16:
; X86: # %bb.0:
; X86-NEXT: vpsllw $15, %xmm1, %xmm1
; X86-NEXT: vpmovw2m %xmm1, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1}
; X86-NEXT: retl
%res = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %addr, i32 4, <8 x i1> %mask, <8 x half> %val)
ret <8 x half> %res
}
| |
; llvm.masked.load with an undef passthrough: the lowering is free to use the
; zeroing form vmovdqu16 {%k1} {z}.
define <8 x half> @maskuloadu8f16(<8 x half>* %addr, <8 x i1> %mask) {
; X64-LABEL: maskuloadu8f16:
; X64: # %bb.0:
; X64-NEXT: vpsllw $15, %xmm0, %xmm0
; X64-NEXT: vpmovw2m %xmm0, %k1
; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: maskuloadu8f16:
; X86: # %bb.0:
; X86-NEXT: vpsllw $15, %xmm0, %xmm0
; X86-NEXT: vpmovw2m %xmm0, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z}
; X86-NEXT: retl
%res = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %addr, i32 4, <8 x i1> %mask, <8 x half> undef)
ret <8 x half> %res
}
| |
; llvm.masked.load with zeroinitializer passthrough: must use the zeroing
; form vmovdqu16 {%k1} {z}.
define <8 x half> @maskzloadu8f16(<8 x half>* %addr, <8 x i1> %mask) {
; X64-LABEL: maskzloadu8f16:
; X64: # %bb.0:
; X64-NEXT: vpsllw $15, %xmm0, %xmm0
; X64-NEXT: vpmovw2m %xmm0, %k1
; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: maskzloadu8f16:
; X86: # %bb.0:
; X86-NEXT: vpsllw $15, %xmm0, %xmm0
; X86-NEXT: vpmovw2m %xmm0, %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z}
; X86-NEXT: retl
%res = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %addr, i32 4, <8 x i1> %mask, <8 x half> zeroinitializer)
ret <8 x half> %res
}
| |
; Plain register-to-register v8f16 copy (return the second argument).
define <8 x half> @movrr8f16(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: movrr8f16:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
ret <8 x half> %b
}
| |
; Masked register blend of two v8f16 values: select on an i8 mask should
; lower to vpblendmw under %k1.
define <8 x half> @movrrk8f16(<8 x half> %a, <8 x half> %b, i8 %msk) {
; X64-LABEL: movrrk8f16:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
;
; X86-LABEL: movrrk8f16:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
%mask = bitcast i8 %msk to <8 x i1>
%res = select <8 x i1> %mask, <8 x half> %a, <8 x half> %b
ret <8 x half> %res
}
| |
; Masked select of a v8f16 register against zero: lowers to a zero-masking
; register move vmovdqu16 {%k1} {z}.
define <8 x half> @movrrkz8f16(<8 x half> %a, i8 %msk) {
; X64-LABEL: movrrkz8f16:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
;
; X86-LABEL: movrrkz8f16:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
%mask = bitcast i8 %msk to <8 x i1>
%res = select <8 x i1> %mask, <8 x half> %a, <8 x half> zeroinitializer
ret <8 x half> %res
}
| |
; Two shuffles feeding an fadd: %res1 is a general single-input shuffle
; (vpshufb), while %res2 keeps lane 0 of %a and lanes 1..7 of %b, which is
; exactly the scalar-insert pattern that should select vmovsh.
define <8 x half> @movsh(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: movsh:
; CHECK: # %bb.0:
; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11]
; CHECK-NEXT: vmovsh %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vaddph %xmm0, %xmm2, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%res1 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 7, i32 0, i32 1, i32 2, i32 3, i32 7, i32 5>
%res2 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%res = fadd <8 x half> %res1, %res2
ret <8 x half> %res
}
| |
; bitcast half -> i16: X64 moves the value out of xmm0 with vmovw; X86 passes
; the half on the stack, so a plain movzwl suffices.
define i16 @test_movw(half %x) {
; X64-LABEL: test_movw:
; X64: # %bb.0:
; X64-NEXT: vmovw %xmm0, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
;
; X86-LABEL: test_movw:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
%res = bitcast half %x to i16
ret i16 %res
}
| |
; bitcast i16 -> half: vmovw into xmm0 on X64; on X86 the i16 is on the
; stack, so the result is loaded directly with vmovsh.
define half @test_movw2(i16 %x) {
; X64-LABEL: test_movw2:
; X64: # %bb.0:
; X64-NEXT: vmovw %edi, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test_movw2:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: retl
%res = bitcast i16 %x to half
ret half %res
}
| |
| ; sext avoids having a truncate in front of the bitcast input due to calling |
| ; convention or i16 op promotion. |
; sext i8 -> i16 then bitcast to half: the sign-extension (movsbl) feeds
; vmovw directly on both targets.
define half @test_movw3(i8 %x) {
; X64-LABEL: test_movw3:
; X64: # %bb.0:
; X64-NEXT: movsbl %dil, %eax
; X64-NEXT: vmovw %eax, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: test_movw3:
; X86: # %bb.0:
; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovw %eax, %xmm0
; X86-NEXT: retl
%z = sext i8 %x to i16
%a = bitcast i16 %z to half
ret half %a
}
| |
; extractelement of each half lane 0..7, returned in xmm0. Lane 0 is already
; in place (no code); the other lanes are brought to lane 0 with the cheapest
; available shuffle/shift (vpsrld, vmovshdup, vpsrlq, vpermilpd, vpsrldq,
; vpermilps) rather than a generic extract.
define half @extract_f16_0(<8 x half> %x) {
; CHECK-LABEL: extract_f16_0:
; CHECK: # %bb.0:
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x half> %x, i32 0
ret half %res
}

define half @extract_f16_1(<8 x half> %x) {
; CHECK-LABEL: extract_f16_1:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrld $16, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x half> %x, i32 1
ret half %res
}

define half @extract_f16_2(<8 x half> %x) {
; CHECK-LABEL: extract_f16_2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x half> %x, i32 2
ret half %res
}

define half @extract_f16_3(<8 x half> %x) {
; CHECK-LABEL: extract_f16_3:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlq $48, %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x half> %x, i32 3
ret half %res
}

define half @extract_f16_4(<8 x half> %x) {
; CHECK-LABEL: extract_f16_4:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x half> %x, i32 4
ret half %res
}

define half @extract_f16_5(<8 x half> %x) {
; CHECK-LABEL: extract_f16_5:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x half> %x, i32 5
ret half %res
}

define half @extract_f16_6(<8 x half> %x) {
; CHECK-LABEL: extract_f16_6:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x half> %x, i32 6
ret half %res
}

define half @extract_f16_7(<8 x half> %x) {
; CHECK-LABEL: extract_f16_7:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x half> %x, i32 7
ret half %res
}
| |
; extractelement of each i16 lane 0..7 into a GPR: lane 0 uses the new vmovw,
; all other lanes use vpextrw with the lane index.
define i16 @extract_i16_0(<8 x i16> %x) {
; CHECK-LABEL: extract_i16_0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovw %xmm0, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x i16> %x, i32 0
ret i16 %res
}

define i16 @extract_i16_1(<8 x i16> %x) {
; CHECK-LABEL: extract_i16_1:
; CHECK: # %bb.0:
; CHECK-NEXT: vpextrw $1, %xmm0, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x i16> %x, i32 1
ret i16 %res
}

define i16 @extract_i16_2(<8 x i16> %x) {
; CHECK-LABEL: extract_i16_2:
; CHECK: # %bb.0:
; CHECK-NEXT: vpextrw $2, %xmm0, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x i16> %x, i32 2
ret i16 %res
}

define i16 @extract_i16_3(<8 x i16> %x) {
; CHECK-LABEL: extract_i16_3:
; CHECK: # %bb.0:
; CHECK-NEXT: vpextrw $3, %xmm0, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x i16> %x, i32 3
ret i16 %res
}

define i16 @extract_i16_4(<8 x i16> %x) {
; CHECK-LABEL: extract_i16_4:
; CHECK: # %bb.0:
; CHECK-NEXT: vpextrw $4, %xmm0, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x i16> %x, i32 4
ret i16 %res
}

define i16 @extract_i16_5(<8 x i16> %x) {
; CHECK-LABEL: extract_i16_5:
; CHECK: # %bb.0:
; CHECK-NEXT: vpextrw $5, %xmm0, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x i16> %x, i32 5
ret i16 %res
}

define i16 @extract_i16_6(<8 x i16> %x) {
; CHECK-LABEL: extract_i16_6:
; CHECK: # %bb.0:
; CHECK-NEXT: vpextrw $6, %xmm0, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x i16> %x, i32 6
ret i16 %res
}

define i16 @extract_i16_7(<8 x i16> %x) {
; CHECK-LABEL: extract_i16_7:
; CHECK: # %bb.0:
; CHECK-NEXT: vpextrw $7, %xmm0, %eax
; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x i16> %x, i32 7
ret i16 %res
}
| |
; Extract-then-store of each half lane 0..7: the lane is shuffled down to
; element 0 (same shuffles as the extract_f16_* tests above) and stored with
; the scalar vmovsh; lane 0 needs no shuffle at all.
define void @extract_store_f16_0(<8 x half> %x, half* %y) {
; X64-LABEL: extract_store_f16_0:
; X64: # %bb.0:
; X64-NEXT: vmovsh %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_f16_0:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovsh %xmm0, (%eax)
; X86-NEXT: retl
%res = extractelement <8 x half> %x, i32 0
store half %res, half* %y
ret void
}

define void @extract_store_f16_1(<8 x half> %x, half* %y) {
; X64-LABEL: extract_store_f16_1:
; X64: # %bb.0:
; X64-NEXT: vpsrld $16, %xmm0, %xmm0
; X64-NEXT: vmovsh %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_f16_1:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpsrld $16, %xmm0, %xmm0
; X86-NEXT: vmovsh %xmm0, (%eax)
; X86-NEXT: retl
%res = extractelement <8 x half> %x, i32 1
store half %res, half* %y
ret void
}

define void @extract_store_f16_2(<8 x half> %x, half* %y) {
; X64-LABEL: extract_store_f16_2:
; X64: # %bb.0:
; X64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X64-NEXT: vmovsh %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_f16_2:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X86-NEXT: vmovsh %xmm0, (%eax)
; X86-NEXT: retl
%res = extractelement <8 x half> %x, i32 2
store half %res, half* %y
ret void
}

define void @extract_store_f16_3(<8 x half> %x, half* %y) {
; X64-LABEL: extract_store_f16_3:
; X64: # %bb.0:
; X64-NEXT: vpsrlq $48, %xmm0, %xmm0
; X64-NEXT: vmovsh %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_f16_3:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpsrlq $48, %xmm0, %xmm0
; X86-NEXT: vmovsh %xmm0, (%eax)
; X86-NEXT: retl
%res = extractelement <8 x half> %x, i32 3
store half %res, half* %y
ret void
}

define void @extract_store_f16_4(<8 x half> %x, half* %y) {
; X64-LABEL: extract_store_f16_4:
; X64: # %bb.0:
; X64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; X64-NEXT: vmovsh %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_f16_4:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; X86-NEXT: vmovsh %xmm0, (%eax)
; X86-NEXT: retl
%res = extractelement <8 x half> %x, i32 4
store half %res, half* %y
ret void
}

define void @extract_store_f16_5(<8 x half> %x, half* %y) {
; X64-LABEL: extract_store_f16_5:
; X64: # %bb.0:
; X64-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X64-NEXT: vmovsh %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_f16_5:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: vmovsh %xmm0, (%eax)
; X86-NEXT: retl
%res = extractelement <8 x half> %x, i32 5
store half %res, half* %y
ret void
}

define void @extract_store_f16_6(<8 x half> %x, half* %y) {
; X64-LABEL: extract_store_f16_6:
; X64: # %bb.0:
; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; X64-NEXT: vmovsh %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_f16_6:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; X86-NEXT: vmovsh %xmm0, (%eax)
; X86-NEXT: retl
%res = extractelement <8 x half> %x, i32 6
store half %res, half* %y
ret void
}

define void @extract_store_f16_7(<8 x half> %x, half* %y) {
; X64-LABEL: extract_store_f16_7:
; X64: # %bb.0:
; X64-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X64-NEXT: vmovsh %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_f16_7:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: vmovsh %xmm0, (%eax)
; X86-NEXT: retl
%res = extractelement <8 x half> %x, i32 7
store half %res, half* %y
ret void
}
| |
; Extract-then-store of each i16 lane 0..7: every lane (including lane 0)
; folds into a single vpextrw-to-memory.
define void @extract_store_i16_0(<8 x i16> %x, i16* %y) {
; X64-LABEL: extract_store_i16_0:
; X64: # %bb.0:
; X64-NEXT: vpextrw $0, %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_i16_0:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpextrw $0, %xmm0, (%eax)
; X86-NEXT: retl
%res = extractelement <8 x i16> %x, i32 0
store i16 %res, i16* %y
ret void
}

define void @extract_store_i16_1(<8 x i16> %x, i16* %y) {
; X64-LABEL: extract_store_i16_1:
; X64: # %bb.0:
; X64-NEXT: vpextrw $1, %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_i16_1:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpextrw $1, %xmm0, (%eax)
; X86-NEXT: retl
%res = extractelement <8 x i16> %x, i32 1
store i16 %res, i16* %y
ret void
}

define void @extract_store_i16_2(<8 x i16> %x, i16* %y) {
; X64-LABEL: extract_store_i16_2:
; X64: # %bb.0:
; X64-NEXT: vpextrw $2, %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_i16_2:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpextrw $2, %xmm0, (%eax)
; X86-NEXT: retl
%res = extractelement <8 x i16> %x, i32 2
store i16 %res, i16* %y
ret void
}

define void @extract_store_i16_3(<8 x i16> %x, i16* %y) {
; X64-LABEL: extract_store_i16_3:
; X64: # %bb.0:
; X64-NEXT: vpextrw $3, %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_i16_3:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpextrw $3, %xmm0, (%eax)
; X86-NEXT: retl
%res = extractelement <8 x i16> %x, i32 3
store i16 %res, i16* %y
ret void
}

define void @extract_store_i16_4(<8 x i16> %x, i16* %y) {
; X64-LABEL: extract_store_i16_4:
; X64: # %bb.0:
; X64-NEXT: vpextrw $4, %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_i16_4:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpextrw $4, %xmm0, (%eax)
; X86-NEXT: retl
%res = extractelement <8 x i16> %x, i32 4
store i16 %res, i16* %y
ret void
}

define void @extract_store_i16_5(<8 x i16> %x, i16* %y) {
; X64-LABEL: extract_store_i16_5:
; X64: # %bb.0:
; X64-NEXT: vpextrw $5, %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_i16_5:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpextrw $5, %xmm0, (%eax)
; X86-NEXT: retl
%res = extractelement <8 x i16> %x, i32 5
store i16 %res, i16* %y
ret void
}

define void @extract_store_i16_6(<8 x i16> %x, i16* %y) {
; X64-LABEL: extract_store_i16_6:
; X64: # %bb.0:
; X64-NEXT: vpextrw $6, %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_i16_6:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpextrw $6, %xmm0, (%eax)
; X86-NEXT: retl
%res = extractelement <8 x i16> %x, i32 6
store i16 %res, i16* %y
ret void
}

define void @extract_store_i16_7(<8 x i16> %x, i16* %y) {
; X64-LABEL: extract_store_i16_7:
; X64: # %bb.0:
; X64-NEXT: vpextrw $7, %xmm0, (%rdi)
; X64-NEXT: retq
;
; X86-LABEL: extract_store_i16_7:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpextrw $7, %xmm0, (%eax)
; X86-NEXT: retl
%res = extractelement <8 x i16> %x, i32 7
store i16 %res, i16* %y
ret void
}
| |
; extractelement + zext to i32: the zext is free because vpextrw already
; zero-extends into the full 32-bit register, so no extra movzx and no
; "kill" copy is needed (compare the extract_i16_* tests above).
define i32 @extract_zext_i16_0(<8 x i16> %x) {
; CHECK-LABEL: extract_zext_i16_0:
; CHECK: # %bb.0:
; CHECK-NEXT: vpextrw $0, %xmm0, %eax
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x i16> %x, i32 0
%res2 = zext i16 %res to i32
ret i32 %res2
}

define i32 @extract_zext_i16_1(<8 x i16> %x) {
; CHECK-LABEL: extract_zext_i16_1:
; CHECK: # %bb.0:
; CHECK-NEXT: vpextrw $1, %xmm0, %eax
; CHECK-NEXT: ret{{[l|q]}}
%res = extractelement <8 x i16> %x, i32 1
%res2 = zext i16 %res to i32
ret i32 %res2
}
| |
; Build a v8f16 where only the low 4 lanes are defined (upper 4 undef):
; pairs are combined with vpunpcklwd and joined with vinsertps. On X86 the
; half args arrive on the stack and are first loaded with vmovsh.
define <8 x half> @build_vector_xxxxuuuu(half %a0, half %a1, half %a2, half %a3) {
; X64-LABEL: build_vector_xxxxuuuu:
; X64: # %bb.0:
; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero
; X64-NEXT: retq
;
; X86-LABEL: build_vector_xxxxuuuu:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2
; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; X86-NEXT: retl
%a = insertelement <8 x half> undef, half %a0, i32 0
%b = insertelement <8 x half> %a, half %a1, i32 1
%c = insertelement <8 x half> %b, half %a2, i32 2
%d = insertelement <8 x half> %c, half %a3, i32 3
ret <8 x half> %d
}
| |
; Build a v8f16 where only the high 4 lanes (4..7) are defined: the four
; halves are packed into the low 64 bits, then vpbroadcastq replicates them
; into the upper half as well (the low lanes are undef, so this is legal).
define <8 x half> @build_vector_uuuuxxxx(half %a0, half %a1, half %a2, half %a3) {
; X64-LABEL: build_vector_uuuuxxxx:
; X64: # %bb.0:
; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT: vpbroadcastq %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: build_vector_uuuuxxxx:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2
; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: vpbroadcastq %xmm0, %xmm0
; X86-NEXT: retl
%a = insertelement <8 x half> undef, half %a0, i32 4
%b = insertelement <8 x half> %a, half %a1, i32 5
%c = insertelement <8 x half> %b, half %a2, i32 6
%d = insertelement <8 x half> %c, half %a3, i32 7
ret <8 x half> %d
}
| |
; Build a fully-defined v8f16 from 8 scalar halves: a classic unpack tree
; (vpunpcklwd -> vpunpckldq -> vpunpcklqdq) merging pairs, quads, then the
; full vector.
define <8 x half> @build_vector_xxxxxxxx(half %a0, half %a1, half %a2, half %a3, half %a4, half %a5, half %a6, half %a7) {
; X64-LABEL: build_vector_xxxxxxxx:
; X64: # %bb.0:
; X64-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; X64-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; X64-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; X64-NEXT: retq
;
; X86-LABEL: build_vector_xxxxxxxx:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2
; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2
; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm3
; X86-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X86-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X86-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X86-NEXT: retl
%a = insertelement <8 x half> undef, half %a0, i32 0
%b = insertelement <8 x half> %a, half %a1, i32 1
%c = insertelement <8 x half> %b, half %a2, i32 2
%d = insertelement <8 x half> %c, half %a3, i32 3
%e = insertelement <8 x half> %d, half %a4, i32 4
%f = insertelement <8 x half> %e, half %a5, i32 5
%g = insertelement <8 x half> %f, half %a6, i32 6
%h = insertelement <8 x half> %g, half %a7, i32 7
ret <8 x half> %h
}
| |
; Build a v16f16 with defined lanes 0..3 and 12..15 only: each defined quad
; is assembled in an xmm (as in the v8 tests above) and the two halves are
; combined with vinsertf128.
define <16 x half> @build_vector_xxxxuuuuuuuuxxxx(half %a0, half %a1, half %a2, half %a3, half %a4, half %a5, half %a6, half %a7) {
; X64-LABEL: build_vector_xxxxuuuuuuuuxxxx:
; X64: # %bb.0:
; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero
; X64-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; X64-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-NEXT: vpbroadcastq %xmm1, %xmm1
; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-NEXT: retq
;
; X86-LABEL: build_vector_xxxxuuuuuuuuxxxx:
; X86: # %bb.0:
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2
; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2
; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2
; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm3
; X86-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero
; X86-NEXT: vpbroadcastq %xmm0, %xmm0
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT: retl
%a = insertelement <16 x half> undef, half %a0, i32 0
%b = insertelement <16 x half> %a, half %a1, i32 1
%c = insertelement <16 x half> %b, half %a2, i32 2
%d = insertelement <16 x half> %c, half %a3, i32 3
%e = insertelement <16 x half> %d, half %a4, i32 12
%f = insertelement <16 x half> %e, half %a5, i32 13
%g = insertelement <16 x half> %f, half %a6, i32 14
%h = insertelement <16 x half> %g, half %a7, i32 15
ret <16 x half> %h
}
| |
; Regression test: a single-input v8f16 shuffle (only %a lanes are selected)
; must collapse to one vpshufb instead of a longer sequence.
define <8 x half> @regression1(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: regression1:
; CHECK: # %bb.0:
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 7, i32 0, i32 1, i32 2, i32 3, i32 7, i32 5>
ret <8 x half> %res
}
| |
; Regression test: two adjacent i8 loads (one via addrspace(4)) built into a
; <2 x i8>, widened with uitofp and blended with constants. The pair of byte
; loads should merge into a single 16-bit vmovw load feeding vpmovzxbd.
define <4 x float> @regression2(i8 addrspace(1)* %0, <4 x i32> %1, <4 x i32> %2, <4 x float> %3, i8* %4) {
; X64-LABEL: regression2:
; X64: # %bb.0:
; X64-NEXT: vmovw (%rsi), %xmm0
; X64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; X64-NEXT: vcvtdq2ps %xmm0, %xmm0
; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
; X64-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: regression2:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovw (%eax), %xmm0
; X86-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; X86-NEXT: vcvtdq2ps %xmm0, %xmm0
; X86-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
; X86-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0
; X86-NEXT: retl
%6 = getelementptr i8, i8* %4, i64 0
%7 = getelementptr i8, i8* %6, i64 0
%8 = getelementptr i8, i8* %7, i64 0
%9 = load i8, i8* %8, align 1
%10 = getelementptr i8, i8* %8, i64 1
%11 = addrspacecast i8* %10 to i8 addrspace(4)*
%12 = load i8, i8 addrspace(4)* %11, align 1
%13 = insertelement <2 x i8> poison, i8 %9, i32 0
%14 = insertelement <2 x i8> %13, i8 %12, i32 1
%15 = uitofp <2 x i8> %14 to <2 x float>
%16 = shufflevector <2 x float> %15, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
%17 = shufflevector <4 x float> %16, <4 x float> <float poison, float poison, float 0.000000e+00, float 2.550000e+02>, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
%18 = fmul contract <4 x float> %17, <float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000>
ret <4 x float> %18
}
| |
| ; Make sure load/stores of v4f16 are handled well on 32-bit targets where |
| ; default widening legalization can't use i64. |
; v4f16 (64-bit) load/add/store: both operands are loaded as scalar doubles
; (vmovsd), added with vaddph, and stored back with vmovlps — no widening to
; a full 128-bit memory access on either target.
define void @load_store_v4f16(<4 x half>* %x, <4 x half>* %y, <4 x half>* %z) {
; X64-LABEL: load_store_v4f16:
; X64: # %bb.0:
; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT: vaddph %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovlps %xmm0, (%rdx)
; X64-NEXT: retq
;
; X86-LABEL: load_store_v4f16:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT: vaddph %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovlps %xmm0, (%eax)
; X86-NEXT: retl
%a = load <4 x half>, <4 x half>* %x
%b = load <4 x half>, <4 x half>* %y
%c = fadd <4 x half> %a, %b
store <4 x half> %c, <4 x half>* %z
ret void
}
| |
| define <8 x half> @test21(half %a, half %b, half %c) nounwind { |
| ; X64-LABEL: test21: |
| ; X64: # %bb.0: |
| ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 |
| ; X64-NEXT: vmovsh %xmm2, %xmm3, %xmm2 |
| ; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] |
| ; X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] |
| ; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 |
| ; X64-NEXT: vpbroadcastw %xmm1, %xmm1 |
| ; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: test21: |
| ; X86: # %bb.0: |
| ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 |
| ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 |
| ; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] |
| ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 |
| ; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] |
| ; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1 |
| ; X86-NEXT: vpbroadcastw %xmm1, %xmm1 |
| ; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] |
| ; X86-NEXT: retl |
| ; Insert the three scalar half arguments into lanes 0-2 of a constant |
| ; vector whose lanes 3-7 are zero (0xH0000); lanes 0-2 start as poison, |
| ; so only the inserted values and the zero tail are observable. |
| %1 = insertelement <8 x half> <half poison, half poison, half poison, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000, half 0xH0000>, half %a, i32 0 |
| %2 = insertelement <8 x half> %1, half %b, i32 1 |
| %3 = insertelement <8 x half> %2, half %c, i32 2 |
| ret <8 x half> %3 |
| } |
| |
| define <16 x i16> @test22(i16* %mem) nounwind { |
| ; X64-LABEL: test22: |
| ; X64: # %bb.0: |
| ; X64-NEXT: movzwl 0, %eax |
| ; X64-NEXT: andw (%rdi), %ax |
| ; X64-NEXT: vmovw %eax, %xmm0 |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: test22: |
| ; X86: # %bb.0: |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: movzwl 0, %ecx |
| ; X86-NEXT: andw (%eax), %cx |
| ; X86-NEXT: vmovw %ecx, %xmm0 |
| ; X86-NEXT: retl |
| ; Lane 0 is the AND of an i16 loaded from the constant null address |
| ; (hence the `movzwl 0` in the expected asm) with an i16 loaded from |
| ; %mem; lanes 1-15 of the result are zero. |
| %1 = load i16, i16* null, align 2 |
| %2 = load i16, i16* %mem, align 2 |
| %3 = and i16 %1, %2 |
| %4 = insertelement <16 x i16> <i16 undef, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, i16 %3, i32 0 |
| ret <16 x i16> %4 |
| } |
| |
| ; Regression test for PR52560: a <2 x i16> compare/select whose lane 0 is |
| ; extracted and used as a scalar branch condition. |
| define void @pr52560(i8 %0, <2 x i16> %1, i8* %c) nounwind { |
| ; X64-LABEL: pr52560: |
| ; X64: # %bb.0: # %entry |
| ; X64-NEXT: movsbl %dil, %eax |
| ; X64-NEXT: vmovw %eax, %xmm1 |
| ; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 |
| ; X64-NEXT: vpcmpgtw %xmm2, %xmm1, %k1 |
| ; X64-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z} |
| ; X64-NEXT: vmovw %xmm0, %eax |
| ; X64-NEXT: testw %ax, %ax |
| ; X64-NEXT: je .LBB121_2 |
| ; X64-NEXT: # %bb.1: # %for.body.preheader |
| ; X64-NEXT: movb $0, (%rsi) |
| ; X64-NEXT: .LBB121_2: # %for.end |
| ; X64-NEXT: retq |
| ; |
| ; X86-LABEL: pr52560: |
| ; X86: # %bb.0: # %entry |
| ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: vmovw %eax, %xmm1 |
| ; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 |
| ; X86-NEXT: vpcmpgtw %xmm2, %xmm1, %k1 |
| ; X86-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z} |
| ; X86-NEXT: vmovw %xmm0, %eax |
| ; X86-NEXT: testw %ax, %ax |
| ; X86-NEXT: je .LBB121_2 |
| ; X86-NEXT: # %bb.1: # %for.body.preheader |
| ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax |
| ; X86-NEXT: movb $0, (%eax) |
| ; X86-NEXT: .LBB121_2: # %for.end |
| ; X86-NEXT: retl |
| entry: |
| ; Lane 0 of %2 holds sext(%0); lane 1 holds the constant 0. Only lane 0 |
| ; of the select result is read by the extractelement below, so lane 1's |
| ; poison select arm is never observed. |
| %conv = sext i8 %0 to i16 |
| %2 = insertelement <2 x i16> <i16 poison, i16 0>, i16 %conv, i32 0 |
| %3 = icmp sgt <2 x i16> %2, zeroinitializer |
| %4 = select <2 x i1> %3, <2 x i16> %1, <2 x i16> <i16 0, i16 poison> |
| %5 = extractelement <2 x i16> %4, i32 0 |
| %tobool.not14 = icmp eq i16 %5, 0 |
| br i1 %tobool.not14, label %for.end, label %for.body.preheader |
|  |
| ; Store a zero byte only when the extracted lane was non-zero. |
| for.body.preheader: ; preds = %entry |
| store i8 0, i8* %c, align 1 |
| br label %for.end |
|  |
| for.end: ; preds = %for.body.preheader, %entry |
| ret void |
| } |