; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE2
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE4
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512

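; Dot products of a strided gather against a contiguous load: elements from %a
; are loaded with a runtime stride (%a_stride) and zero-extended, the vector
; loaded from %b is sign-extended, and the widened products are summed with
; llvm.vector.reduce.add.

; v8i8 zext * v8i16 sext -> i32 reduction. Both operands fit pmaddwd's signed
; 16-bit inputs, so every subtarget selects (v)pmaddwd.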
define i32 @dot_ext_v8i8_v8i32(ptr %a, i64 %a_stride, ptr %b) nounwind {
; SSE2-LABEL: dot_ext_v8i8_v8i32:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pushq %rbx
; SSE2-NEXT: movzbl (%rdi), %eax
; SSE2-NEXT: movzbl (%rdi,%rsi), %ecx
; SSE2-NEXT: movzbl (%rdi,%rsi,2), %r8d
; SSE2-NEXT: leaq (%rsi,%rsi,2), %r9
; SSE2-NEXT: movzbl (%rdi,%r9), %r10d
; SSE2-NEXT: movzbl (%rdi,%rsi,4), %r11d
; SSE2-NEXT: leaq (%rsi,%rsi,4), %rbx
; SSE2-NEXT: movzbl (%rdi,%rbx), %ebx
; SSE2-NEXT: movzbl (%rdi,%r9,2), %r9d
; SSE2-NEXT: leaq (%rdi,%rsi,8), %rdi
; SSE2-NEXT: subq %rsi, %rdi
; SSE2-NEXT: movzbl (%rdi), %esi
; SSE2-NEXT: shll $16, %ecx
; SSE2-NEXT: orl %eax, %ecx
; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: pinsrw $2, %r8d, %xmm0
; SSE2-NEXT: pinsrw $3, %r10d, %xmm0
; SSE2-NEXT: pinsrw $4, %r11d, %xmm0
; SSE2-NEXT: pinsrw $5, %ebx, %xmm0
; SSE2-NEXT: pinsrw $6, %r9d, %xmm0
; SSE2-NEXT: pinsrw $7, %esi, %xmm0
; SSE2-NEXT: movdqu (%rdx), %xmm1
; SSE2-NEXT: pmaddwd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: retq
;
; SSE4-LABEL: dot_ext_v8i8_v8i32:
; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movzbl (%rdi), %eax
; SSE4-NEXT: leaq (%rsi,%rsi,4), %rcx
; SSE4-NEXT: leaq (%rsi,%rsi,2), %r8
; SSE4-NEXT: leaq (%rdi,%rsi,8), %r9
; SSE4-NEXT: subq %rsi, %r9
; SSE4-NEXT: movd %eax, %xmm0
; SSE4-NEXT: pinsrb $2, (%rdi,%rsi), %xmm0
; SSE4-NEXT: pinsrb $4, (%rdi,%rsi,2), %xmm0
; SSE4-NEXT: pinsrb $6, (%rdi,%r8), %xmm0
; SSE4-NEXT: pinsrb $8, (%rdi,%rsi,4), %xmm0
; SSE4-NEXT: pinsrb $10, (%rdi,%rcx), %xmm0
; SSE4-NEXT: pinsrb $12, (%rdi,%r8,2), %xmm0
; SSE4-NEXT: pinsrb $14, (%r9), %xmm0
; SSE4-NEXT: movdqu (%rdx), %xmm1
; SSE4-NEXT: pmaddwd %xmm0, %xmm1
; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE4-NEXT: paddd %xmm1, %xmm0
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-NEXT: paddd %xmm0, %xmm1
; SSE4-NEXT: movd %xmm1, %eax
; SSE4-NEXT: retq
;
; AVX-LABEL: dot_ext_v8i8_v8i32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: movzbl (%rdi), %eax
; AVX-NEXT: leaq (%rsi,%rsi,2), %rcx
; AVX-NEXT: leaq (%rsi,%rsi,4), %r8
; AVX-NEXT: leaq (%rdi,%rsi,8), %r9
; AVX-NEXT: subq %rsi, %r9
; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: vpinsrb $2, (%rdi,%rsi), %xmm0, %xmm0
; AVX-NEXT: vpinsrb $4, (%rdi,%rsi,2), %xmm0, %xmm0
; AVX-NEXT: vpinsrb $6, (%rdi,%rcx), %xmm0, %xmm0
; AVX-NEXT: vpinsrb $8, (%rdi,%rsi,4), %xmm0, %xmm0
; AVX-NEXT: vpinsrb $10, (%rdi,%r8), %xmm0, %xmm0
; AVX-NEXT: vpinsrb $12, (%rdi,%rcx,2), %xmm0, %xmm0
; AVX-NEXT: vpinsrb $14, (%r9), %xmm0, %xmm0
; AVX-NEXT: vpmaddwd (%rdx), %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
entry:
%var0 = load i8, ptr %a, align 1
%arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
%var1 = load i8, ptr %arrayidx.1, align 1
%mul.2 = shl nsw i64 %a_stride, 1
%arrayidx.2 = getelementptr inbounds i8, ptr %a, i64 %mul.2
%var2 = load i8, ptr %arrayidx.2, align 1
%mul.3 = mul nsw i64 %a_stride, 3
%arrayidx.3 = getelementptr inbounds i8, ptr %a, i64 %mul.3
%var3 = load i8, ptr %arrayidx.3, align 1
%mul.4 = shl nsw i64 %a_stride, 2
%arrayidx.4 = getelementptr inbounds i8, ptr %a, i64 %mul.4
%var4 = load i8, ptr %arrayidx.4, align 1
%mul.5 = mul nsw i64 %a_stride, 5
%arrayidx.5 = getelementptr inbounds i8, ptr %a, i64 %mul.5
%var5 = load i8, ptr %arrayidx.5, align 1
%mul.6 = mul nsw i64 %a_stride, 6
%arrayidx.6 = getelementptr inbounds i8, ptr %a, i64 %mul.6
%var6 = load i8, ptr %arrayidx.6, align 1
%mul.7 = mul nsw i64 %a_stride, 7
%arrayidx.7 = getelementptr inbounds i8, ptr %a, i64 %mul.7
%var7 = load i8, ptr %arrayidx.7, align 1
%var8 = insertelement <8 x i8> poison, i8 %var0, i64 0
%var9 = insertelement <8 x i8> %var8, i8 %var1, i64 1
%var10 = insertelement <8 x i8> %var9, i8 %var2, i64 2
%var11 = insertelement <8 x i8> %var10, i8 %var3, i64 3
%var12 = insertelement <8 x i8> %var11, i8 %var4, i64 4
%var13 = insertelement <8 x i8> %var12, i8 %var5, i64 5
%var14 = insertelement <8 x i8> %var13, i8 %var6, i64 6
%var15 = insertelement <8 x i8> %var14, i8 %var7, i64 7
%var16 = zext <8 x i8> %var15 to <8 x i32>
%var17 = load <8 x i16>, ptr %b, align 2
%var18 = sext <8 x i16> %var17 to <8 x i32>
%var19 = mul nsw <8 x i32> %var18, %var16
%var20 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %var19)
ret i32 %var20
}

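; v4i8 zext * v4i16 sext -> i32 reduction; still narrow enough for (v)pmaddwd.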
define i32 @dot_ext_v4i8_v4i32(ptr %a, i64 %a_stride, ptr %b) nounwind {
; SSE2-LABEL: dot_ext_v4i8_v4i32:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movzbl (%rdi), %eax
; SSE2-NEXT: movzbl (%rdi,%rsi), %ecx
; SSE2-NEXT: movzbl (%rdi,%rsi,2), %r8d
; SSE2-NEXT: leaq (%rsi,%rsi,2), %rsi
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: pinsrw $2, %ecx, %xmm0
; SSE2-NEXT: pinsrw $4, %r8d, %xmm0
; SSE2-NEXT: movzbl (%rdi,%rsi), %eax
; SSE2-NEXT: pinsrw $6, %eax, %xmm0
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pmaddwd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; SSE4-LABEL: dot_ext_v4i8_v4i32:
; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movzbl (%rdi), %eax
; SSE4-NEXT: leaq (%rsi,%rsi,2), %rcx
; SSE4-NEXT: movd %eax, %xmm0
; SSE4-NEXT: pinsrb $4, (%rdi,%rsi), %xmm0
; SSE4-NEXT: pinsrb $8, (%rdi,%rsi,2), %xmm0
; SSE4-NEXT: pinsrb $12, (%rdi,%rcx), %xmm0
; SSE4-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; SSE4-NEXT: pmaddwd %xmm0, %xmm1
; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE4-NEXT: paddd %xmm1, %xmm0
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-NEXT: paddd %xmm0, %xmm1
; SSE4-NEXT: movd %xmm1, %eax
; SSE4-NEXT: retq
;
; AVX-LABEL: dot_ext_v4i8_v4i32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: movzbl (%rdi), %eax
; AVX-NEXT: leaq (%rsi,%rsi,2), %rcx
; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: vpinsrb $4, (%rdi,%rsi), %xmm0, %xmm0
; AVX-NEXT: vpinsrb $8, (%rdi,%rsi,2), %xmm0, %xmm0
; AVX-NEXT: vpinsrb $12, (%rdi,%rcx), %xmm0, %xmm0
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
entry:
%var0 = load i8, ptr %a, align 1
%arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
%var1 = load i8, ptr %arrayidx.1, align 1
%mul.2 = shl nsw i64 %a_stride, 1
%arrayidx.2 = getelementptr inbounds i8, ptr %a, i64 %mul.2
%var2 = load i8, ptr %arrayidx.2, align 1
%mul.3 = mul nsw i64 %a_stride, 3
%arrayidx.3 = getelementptr inbounds i8, ptr %a, i64 %mul.3
%var3 = load i8, ptr %arrayidx.3, align 1
%var8 = insertelement <4 x i8> poison, i8 %var0, i64 0
%var9 = insertelement <4 x i8> %var8, i8 %var1, i64 1
%var10 = insertelement <4 x i8> %var9, i8 %var2, i64 2
%var11 = insertelement <4 x i8> %var10, i8 %var3, i64 3
%var16 = zext <4 x i8> %var11 to <4 x i32>
%var17 = load <4 x i16>, ptr %b, align 2
%var18 = sext <4 x i16> %var17 to <4 x i32>
%var19 = mul nsw <4 x i32> %var18, %var16
%var20 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %var19)
ret i32 %var20
}

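; v2i8 zext * v2i16 sext -> i32 reduction.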
define i32 @dot_ext_v2i8_v2i32(ptr %a, i64 %a_stride, ptr %b) nounwind {
; SSE2-LABEL: dot_ext_v2i8_v2i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movzbl (%rdi), %eax
; SSE2-NEXT: movzbl (%rdi,%rsi), %ecx
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: pinsrw $2, %ecx, %xmm0
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
; SSE2-NEXT: pmaddwd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; SSE4-LABEL: dot_ext_v2i8_v2i32:
; SSE4: # %bb.0:
; SSE4-NEXT: movzbl (%rdi), %eax
; SSE4-NEXT: movd %eax, %xmm0
; SSE4-NEXT: pinsrb $4, (%rdi,%rsi), %xmm0
; SSE4-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE4-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE4-NEXT: pmaddwd %xmm0, %xmm1
; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE4-NEXT: paddd %xmm1, %xmm0
; SSE4-NEXT: movd %xmm0, %eax
; SSE4-NEXT: retq
;
; AVX-LABEL: dot_ext_v2i8_v2i32:
; AVX: # %bb.0:
; AVX-NEXT: movzbl (%rdi), %eax
; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: vpinsrb $4, (%rdi,%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
%var0 = load i8, ptr %a, align 1
%arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
%var1 = load i8, ptr %arrayidx.1, align 1
%var8 = insertelement <2 x i8> poison, i8 %var0, i64 0
%var9 = insertelement <2 x i8> %var8, i8 %var1, i64 1
%var16 = zext <2 x i8> %var9 to <2 x i32>
%var17 = load <2 x i16>, ptr %b, align 2
%var18 = sext <2 x i16> %var17 to <2 x i32>
%var19 = mul nsw <2 x i32> %var18, %var16
%var20 = tail call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %var19)
ret i32 %var20
}

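; v2i8 zext * v2i8 sext -> i64 reduction; the i64 result is produced with the
; 32x32->64 (v)pmuldq on SSE4 and above, while SSE2 expands the multiply.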
define i64 @dot_ext_v2i8_v2i64(ptr %a, i64 %a_stride, ptr %b) nounwind {
; SSE2-LABEL: dot_ext_v2i8_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movzbl (%rdi), %eax
; SSE2-NEXT: movzbl (%rdi,%rsi), %ecx
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: pinsrw $4, %ecx, %xmm0
; SSE2-NEXT: movzwl (%rdx), %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: psrad $24, %xmm1
; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: pmuludq %xmm0, %xmm2
; SSE2-NEXT: psllq $32, %xmm2
; SSE2-NEXT: paddq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE2-NEXT: paddq %xmm2, %xmm0
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: retq
;
; SSE4-LABEL: dot_ext_v2i8_v2i64:
; SSE4: # %bb.0:
; SSE4-NEXT: movzbl (%rdi), %eax
; SSE4-NEXT: movd %eax, %xmm0
; SSE4-NEXT: pinsrb $8, (%rdi,%rsi), %xmm0
; SSE4-NEXT: pmovsxbq (%rdx), %xmm1
; SSE4-NEXT: pmuldq %xmm0, %xmm1
; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE4-NEXT: paddq %xmm1, %xmm0
; SSE4-NEXT: movq %xmm0, %rax
; SSE4-NEXT: retq
;
; AVX-LABEL: dot_ext_v2i8_v2i64:
; AVX: # %bb.0:
; AVX-NEXT: movzbl (%rdi), %eax
; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: vpinsrb $8, (%rdi,%rsi), %xmm0, %xmm0
; AVX-NEXT: vpmovsxbq (%rdx), %xmm1
; AVX-NEXT: vpmuldq %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
%var0 = load i8, ptr %a, align 1
%arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
%var1 = load i8, ptr %arrayidx.1, align 1
%var8 = insertelement <2 x i8> poison, i8 %var0, i64 0
%var9 = insertelement <2 x i8> %var8, i8 %var1, i64 1
%var16 = zext <2 x i8> %var9 to <2 x i64>
%var17 = load <2 x i8>, ptr %b, align 2
%var18 = sext <2 x i8> %var17 to <2 x i64>
%var19 = mul nsw <2 x i64> %var18, %var16
%var20 = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %var19)
ret i64 %var20
}

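; v4i16 zext * v4i16 sext -> i32 reduction; the zero-extended operand no longer
; fits pmaddwd's signed 16-bit inputs, so a 32-bit multiply is used
; ((v)pmulld on SSE4 and above).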
define i32 @dot_ext_v4i16_v4i32(ptr %a, i64 %a_stride, ptr %b) nounwind {
; SSE2-LABEL: dot_ext_v4i16_v4i32:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movzwl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: pinsrw $2, (%rdi,%rsi), %xmm0
; SSE2-NEXT: pinsrw $4, (%rdi,%rsi,2), %xmm0
; SSE2-NEXT: leaq (%rsi,%rsi,2), %rax
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pinsrw $6, (%rdi,%rax), %xmm1
; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; SSE4-LABEL: dot_ext_v4i16_v4i32:
; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movzwl (%rdi), %eax
; SSE4-NEXT: leaq (%rsi,%rsi,2), %rcx
; SSE4-NEXT: movd %eax, %xmm0
; SSE4-NEXT: pinsrw $2, (%rdi,%rsi), %xmm0
; SSE4-NEXT: pinsrw $4, (%rdi,%rsi,2), %xmm0
; SSE4-NEXT: pinsrw $6, (%rdi,%rcx), %xmm0
; SSE4-NEXT: pmovsxwd (%rdx), %xmm1
; SSE4-NEXT: pmulld %xmm0, %xmm1
; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE4-NEXT: paddd %xmm1, %xmm0
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE4-NEXT: paddd %xmm0, %xmm1
; SSE4-NEXT: movd %xmm1, %eax
; SSE4-NEXT: retq
;
; AVX-LABEL: dot_ext_v4i16_v4i32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: movzwl (%rdi), %eax
; AVX-NEXT: leaq (%rsi,%rsi,2), %rcx
; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: vpinsrw $2, (%rdi,%rsi), %xmm0, %xmm0
; AVX-NEXT: vpinsrw $4, (%rdi,%rsi,2), %xmm0, %xmm0
; AVX-NEXT: vpinsrw $6, (%rdi,%rcx), %xmm0, %xmm0
; AVX-NEXT: vpmovsxwd (%rdx), %xmm1
; AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
entry:
%var0 = load i16, ptr %a, align 1
%arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
%var1 = load i16, ptr %arrayidx.1, align 1
%mul.2 = shl nsw i64 %a_stride, 1
%arrayidx.2 = getelementptr inbounds i8, ptr %a, i64 %mul.2
%var2 = load i16, ptr %arrayidx.2, align 1
%mul.3 = mul nsw i64 %a_stride, 3
%arrayidx.3 = getelementptr inbounds i8, ptr %a, i64 %mul.3
%var3 = load i16, ptr %arrayidx.3, align 1
%var8 = insertelement <4 x i16> poison, i16 %var0, i64 0
%var9 = insertelement <4 x i16> %var8, i16 %var1, i64 1
%var10 = insertelement <4 x i16> %var9, i16 %var2, i64 2
%var11 = insertelement <4 x i16> %var10, i16 %var3, i64 3
%var16 = zext <4 x i16> %var11 to <4 x i32>
%var17 = load <4 x i16>, ptr %b, align 2
%var18 = sext <4 x i16> %var17 to <4 x i32>
%var19 = mul nsw <4 x i32> %var18, %var16
%var20 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %var19)
ret i32 %var20
}

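; v2i16 zext * v2i16 sext -> i32 reduction.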
define i32 @dot_ext_v2i16_v2i32(ptr %a, i64 %a_stride, ptr %b) nounwind {
; SSE2-LABEL: dot_ext_v2i16_v2i32:
; SSE2: # %bb.0:
; SSE2-NEXT: movzwl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7]
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: pinsrw $2, (%rdi,%rsi), %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; SSE4-LABEL: dot_ext_v2i16_v2i32:
; SSE4: # %bb.0:
; SSE4-NEXT: movzwl (%rdi), %eax
; SSE4-NEXT: movd %eax, %xmm0
; SSE4-NEXT: pinsrw $1, (%rdi,%rsi), %xmm0
; SSE4-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE4-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE4-NEXT: pmovsxwd %xmm1, %xmm1
; SSE4-NEXT: pmulld %xmm0, %xmm1
; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE4-NEXT: paddd %xmm1, %xmm0
; SSE4-NEXT: movd %xmm0, %eax
; SSE4-NEXT: retq
;
; AVX-LABEL: dot_ext_v2i16_v2i32:
; AVX: # %bb.0:
; AVX-NEXT: movzwl (%rdi), %eax
; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: vpinsrw $1, (%rdi,%rsi), %xmm0, %xmm0
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vpmovsxwd %xmm1, %xmm1
; AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
%var0 = load i16, ptr %a, align 1
%arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
%var1 = load i16, ptr %arrayidx.1, align 1
%var8 = insertelement <2 x i16> poison, i16 %var0, i64 0
%var9 = insertelement <2 x i16> %var8, i16 %var1, i64 1
%var16 = zext <2 x i16> %var9 to <2 x i32>
%var17 = load <2 x i16>, ptr %b, align 2
%var18 = sext <2 x i16> %var17 to <2 x i32>
%var19 = mul nsw <2 x i32> %var18, %var16
%var20 = tail call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %var19)
ret i32 %var20
}

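; v2i32 zext * v2i32 sext -> i64 reduction; only AVX512 has a native 64-bit
; multiply (vpmullq), earlier subtargets expand it.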
define i64 @dot_ext_v2i32_v2i64(ptr %a, i64 %a_stride, ptr %b) nounwind {
; SSE2-LABEL: dot_ext_v2i32_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: psllq $32, %xmm2
; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: paddq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT: paddq %xmm1, %xmm0
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: retq
;
; SSE4-LABEL: dot_ext_v2i32_v2i64:
; SSE4: # %bb.0:
; SSE4-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE4-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE4-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE4-NEXT: pmovsxdq (%rdx), %xmm0
; SSE4-NEXT: movdqa %xmm0, %xmm2
; SSE4-NEXT: pmuludq %xmm1, %xmm2
; SSE4-NEXT: psrlq $32, %xmm0
; SSE4-NEXT: pmuludq %xmm1, %xmm0
; SSE4-NEXT: psllq $32, %xmm0
; SSE4-NEXT: paddq %xmm2, %xmm0
; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE4-NEXT: paddq %xmm0, %xmm1
; SSE4-NEXT: movq %xmm1, %rax
; SSE4-NEXT: retq
;
; AVX2-LABEL: dot_ext_v2i32_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: vpmovsxdq (%rdx), %xmm1
; AVX2-NEXT: vpmuludq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vpsrlq $32, %xmm1, %xmm1
; AVX2-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: retq
;
; AVX512-LABEL: dot_ext_v2i32_v2i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512-NEXT: vpmovsxdq (%rdx), %xmm1
; AVX512-NEXT: vpmullq %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: retq
%var0 = load i32, ptr %a, align 1
%arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 %a_stride
%var1 = load i32, ptr %arrayidx.1, align 1
%var8 = insertelement <2 x i32> poison, i32 %var0, i64 0
%var9 = insertelement <2 x i32> %var8, i32 %var1, i64 1
%var16 = zext <2 x i32> %var9 to <2 x i64>
%var17 = load <2 x i32>, ptr %b, align 2
%var18 = sext <2 x i32> %var17 to <2 x i64>
%var19 = mul nsw <2 x i64> %var18, %var16
%var20 = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %var19)
ret i64 %var20
}

declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)