; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+pclmul | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx,+pclmul | FileCheck %s --check-prefixes=AVX,AVX-PCLMUL
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx,+vpclmulqdq | FileCheck %s --check-prefixes=AVX,AVX-VPCLMULQDQ,AVX2-VPCLMULQDQ
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+vpclmulqdq | FileCheck %s --check-prefixes=AVX,AVX-VPCLMULQDQ,AVX512-VPCLMULQDQ

; PR176879 - Match PCLMULQDQ codegen with llvm.clmul intrinsic implementations
define <2 x i64> @pclmul128_lo_hi(<2 x i64> %v0, <2 x i64> %v1) {
; SSE-LABEL: pclmul128_lo_hi:
; SSE: # %bb.0:
; SSE-NEXT: pclmulqdq $16, %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: pclmul128_lo_hi:
; AVX: # %bb.0:
; AVX-NEXT: vpclmulqdq $16, %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
; Scalar i128 clmul of v0[0] (lo qword) with v1[1] (hi qword). The i1 zexts
; fold to constant indices 0 and 1, so this should match a single PCLMULQDQ
; with imm8 = 0x10 (bit 0 selects the src1 qword, bit 4 the src2 qword).
  %i0 = zext i1 0 to i64 ; constant time lo/hi select
  %i1 = zext i1 1 to i64 ; constant time lo/hi select
  %a0 = extractelement <2 x i64> %v0, i64 %i0
  %a1 = extractelement <2 x i64> %v1, i64 %i1
  %x0 = zext i64 %a0 to i128
  %x1 = zext i64 %a1 to i128
  %cl = call i128 @llvm.clmul.i128(i128 %x0, i128 %x1)
  %r = bitcast i128 %cl to <2 x i64>
  ret <2 x i64> %r
}
| |
define <2 x i64> @pclmul128_hi_hi(<2 x i64> %v0, <2 x i64> %v1) {
; SSE-LABEL: pclmul128_hi_hi:
; SSE: # %bb.0:
; SSE-NEXT: pclmulqdq $17, %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: pclmul128_hi_hi:
; AVX: # %bb.0:
; AVX-NEXT: vpclmulqdq $17, %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
; Scalar i128 clmul of v0[1] with v1[1] (both hi qwords). Both indices fold
; to 1, so this should match a single PCLMULQDQ with imm8 = 0x11.
  %i0 = zext i1 1 to i64 ; constant time lo/hi select
  %i1 = zext i1 1 to i64 ; constant time lo/hi select
  %a0 = extractelement <2 x i64> %v0, i64 %i0
  %a1 = extractelement <2 x i64> %v1, i64 %i1
  %x0 = zext i64 %a0 to i128
  %x1 = zext i64 %a1 to i128
  %cl = call i128 @llvm.clmul.i128(i128 %x0, i128 %x1)
  %r = bitcast i128 %cl to <2 x i64>
  ret <2 x i64> %r
}
| |
define <2 x i64> @pclmul128_hi_lo_vector(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: pclmul128_hi_lo_vector:
; SSE: # %bb.0:
; SSE-NEXT: pclmulqdq $1, %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: pclmul128_hi_lo_vector:
; AVX: # %bb.0:
; AVX-NEXT: vpclmulqdq $1, %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
; Vector (v1i128) form: shuffles select a0[1] (hi) and a1[0] (lo), which
; should match a single PCLMULQDQ with imm8 = 0x01.
  %s0 = shufflevector <2 x i64> %a0, <2 x i64> poison, <1 x i32> <i32 1>
  %s1 = shufflevector <2 x i64> %a1, <2 x i64> poison, <1 x i32> <i32 0>
  %x0 = zext <1 x i64> %s0 to <1 x i128>
  %x1 = zext <1 x i64> %s1 to <1 x i128>
  %clmul = call <1 x i128> @llvm.clmul.v1i128(<1 x i128> %x0, <1 x i128> %x1)
  %res = bitcast <1 x i128> %clmul to <2 x i64>
  ret <2 x i64>%res
}
| |
define <4 x i64> @pclmul256_lo_lo(<4 x i64> %v0, <4 x i64> %v1) {
; SSE-LABEL: pclmul256_lo_lo:
; SSE: # %bb.0:
; SSE-NEXT: pclmulqdq $0, %xmm2, %xmm0
; SSE-NEXT: pclmulqdq $0, %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-PCLMUL-LABEL: pclmul256_lo_lo:
; AVX-PCLMUL: # %bb.0:
; AVX-PCLMUL-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-PCLMUL-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX-PCLMUL-NEXT: vpclmulqdq $0, %xmm3, %xmm2, %xmm2
; AVX-PCLMUL-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0
; AVX-PCLMUL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-PCLMUL-NEXT: retq
;
; AVX-VPCLMULQDQ-LABEL: pclmul256_lo_lo:
; AVX-VPCLMULQDQ: # %bb.0:
; AVX-VPCLMULQDQ-NEXT: vpclmulqdq $0, %ymm1, %ymm0, %ymm0
; AVX-VPCLMULQDQ-NEXT: retq
; 256-bit case: indices fold to 0/0 and 2/2, i.e. the lo qword of each
; 128-bit half of both inputs. Each i128 clmul should become a PCLMULQDQ
; with imm8 = 0x00; targets with +vpclmulqdq should merge the concatenated
; pair into one 256-bit VPCLMULQDQ.
  %i0 = zext i1 0 to i64 ; constant time lo/hi select
  %i1 = zext i1 0 to i64 ; constant time lo/hi select
  %i2 = add i64 %i0, 2
  %i3 = add i64 %i1, 2
  %a0 = extractelement <4 x i64> %v0, i64 %i0
  %a1 = extractelement <4 x i64> %v1, i64 %i1
  %a2 = extractelement <4 x i64> %v0, i64 %i2
  %a3 = extractelement <4 x i64> %v1, i64 %i3
  %x0 = zext i64 %a0 to i128
  %x1 = zext i64 %a1 to i128
  %x2 = zext i64 %a2 to i128
  %x3 = zext i64 %a3 to i128
  %c0 = call i128 @llvm.clmul.i128(i128 %x0, i128 %x1)
  %c1 = call i128 @llvm.clmul.i128(i128 %x2, i128 %x3)
  %r0 = bitcast i128 %c0 to <2 x i64>
  %r1 = bitcast i128 %c1 to <2 x i64>
  %r = shufflevector <2 x i64> %r0, <2 x i64> %r1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i64> %r
}
| |
define <4 x i64> @pclmul256_lo_hi(<4 x i64> %v0, <4 x i64> %v1) {
; SSE-LABEL: pclmul256_lo_hi:
; SSE: # %bb.0:
; SSE-NEXT: pclmulqdq $16, %xmm2, %xmm0
; SSE-NEXT: pclmulqdq $16, %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-PCLMUL-LABEL: pclmul256_lo_hi:
; AVX-PCLMUL: # %bb.0:
; AVX-PCLMUL-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-PCLMUL-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX-PCLMUL-NEXT: vpclmulqdq $16, %xmm3, %xmm2, %xmm2
; AVX-PCLMUL-NEXT: vpclmulqdq $16, %xmm1, %xmm0, %xmm0
; AVX-PCLMUL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-PCLMUL-NEXT: retq
;
; AVX-VPCLMULQDQ-LABEL: pclmul256_lo_hi:
; AVX-VPCLMULQDQ: # %bb.0:
; AVX-VPCLMULQDQ-NEXT: vpclmulqdq $16, %ymm1, %ymm0, %ymm0
; AVX-VPCLMULQDQ-NEXT: retq
; 256-bit case: indices fold to 0/1 and 2/3 — lo qword of each v0 half,
; hi qword of each v1 half. Should match PCLMULQDQ with imm8 = 0x10 per
; 128-bit lane, folding to one 256-bit VPCLMULQDQ where available.
  %i0 = zext i1 0 to i64 ; constant time lo/hi select
  %i1 = zext i1 1 to i64 ; constant time lo/hi select
  %i2 = add i64 %i0, 2
  %i3 = add i64 %i1, 2
  %a0 = extractelement <4 x i64> %v0, i64 %i0
  %a1 = extractelement <4 x i64> %v1, i64 %i1
  %a2 = extractelement <4 x i64> %v0, i64 %i2
  %a3 = extractelement <4 x i64> %v1, i64 %i3
  %x0 = zext i64 %a0 to i128
  %x1 = zext i64 %a1 to i128
  %x2 = zext i64 %a2 to i128
  %x3 = zext i64 %a3 to i128
  %c0 = call i128 @llvm.clmul.i128(i128 %x0, i128 %x1)
  %c1 = call i128 @llvm.clmul.i128(i128 %x2, i128 %x3)
  %r0 = bitcast i128 %c0 to <2 x i64>
  %r1 = bitcast i128 %c1 to <2 x i64>
  %r = shufflevector <2 x i64> %r0, <2 x i64> %r1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i64> %r
}
| |
define <4 x i64> @pclmul256_hi_hi_vector(<4 x i64> %a0, <4 x i64> %a1) {
; SSE-LABEL: pclmul256_hi_hi_vector:
; SSE: # %bb.0:
; SSE-NEXT: pclmulqdq $17, %xmm2, %xmm0
; SSE-NEXT: pclmulqdq $17, %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-PCLMUL-LABEL: pclmul256_hi_hi_vector:
; AVX-PCLMUL: # %bb.0:
; AVX-PCLMUL-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-PCLMUL-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX-PCLMUL-NEXT: vpclmulqdq $17, %xmm3, %xmm2, %xmm2
; AVX-PCLMUL-NEXT: vpclmulqdq $17, %xmm1, %xmm0, %xmm0
; AVX-PCLMUL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-PCLMUL-NEXT: retq
;
; AVX-VPCLMULQDQ-LABEL: pclmul256_hi_hi_vector:
; AVX-VPCLMULQDQ: # %bb.0:
; AVX-VPCLMULQDQ-NEXT: vpclmulqdq $17, %ymm1, %ymm0, %ymm0
; AVX-VPCLMULQDQ-NEXT: retq
; Vector (v2i128) form: the <1,3> shuffles pick the hi qword of each
; 128-bit half of both inputs, which should match imm8 = 0x11 per lane.
  %s0 = shufflevector <4 x i64> %a0, <4 x i64> poison, <2 x i32> <i32 1, i32 3>
  %s1 = shufflevector <4 x i64> %a1, <4 x i64> poison, <2 x i32> <i32 1, i32 3>
  %x0 = zext <2 x i64> %s0 to <2 x i128>
  %x1 = zext <2 x i64> %s1 to <2 x i128>
  %clmul = call <2 x i128> @llvm.clmul.v2i128(<2 x i128> %x0, <2 x i128> %x1)
  %res = bitcast <2 x i128> %clmul to <4 x i64>
  ret <4 x i64>%res
}
| |
define <8 x i64> @pclmul512_lo_hi(<8 x i64> %v0, <8 x i64> %v1) {
; SSE-LABEL: pclmul512_lo_hi:
; SSE: # %bb.0:
; SSE-NEXT: pclmulqdq $16, %xmm4, %xmm0
; SSE-NEXT: pclmulqdq $16, %xmm5, %xmm1
; SSE-NEXT: pclmulqdq $16, %xmm6, %xmm2
; SSE-NEXT: pclmulqdq $16, %xmm7, %xmm3
; SSE-NEXT: retq
;
; AVX-PCLMUL-LABEL: pclmul512_lo_hi:
; AVX-PCLMUL: # %bb.0:
; AVX-PCLMUL-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX-PCLMUL-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX-PCLMUL-NEXT: vpclmulqdq $16, %xmm5, %xmm4, %xmm4
; AVX-PCLMUL-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX-PCLMUL-NEXT: vextractf128 $1, %ymm3, %xmm6
; AVX-PCLMUL-NEXT: vpclmulqdq $16, %xmm6, %xmm5, %xmm5
; AVX-PCLMUL-NEXT: vpclmulqdq $16, %xmm2, %xmm0, %xmm0
; AVX-PCLMUL-NEXT: vpclmulqdq $16, %xmm3, %xmm1, %xmm1
; AVX-PCLMUL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX-PCLMUL-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
; AVX-PCLMUL-NEXT: retq
;
; AVX2-VPCLMULQDQ-LABEL: pclmul512_lo_hi:
; AVX2-VPCLMULQDQ: # %bb.0:
; AVX2-VPCLMULQDQ-NEXT: vpclmulqdq $16, %ymm2, %ymm0, %ymm0
; AVX2-VPCLMULQDQ-NEXT: vpclmulqdq $16, %ymm3, %ymm1, %ymm1
; AVX2-VPCLMULQDQ-NEXT: retq
;
; AVX512-VPCLMULQDQ-LABEL: pclmul512_lo_hi:
; AVX512-VPCLMULQDQ: # %bb.0:
; AVX512-VPCLMULQDQ-NEXT: vpclmulqdq $16, %zmm1, %zmm0, %zmm0
; AVX512-VPCLMULQDQ-NEXT: retq
; 512-bit case: the add chains fold to indices 0,2,4,6 (v0 los) and
; 1,3,5,7 (v1 his) — imm8 = 0x10 in every 128-bit lane. Expect four xmm
; ops on SSE, two ymm ops with +vpclmulqdq, and a single zmm op with
; +avx512f,+vpclmulqdq.
  %i0 = zext i1 0 to i64 ; constant time lo/hi select
  %i1 = zext i1 1 to i64 ; constant time lo/hi select
  %i2 = add i64 %i0, 2
  %i3 = add i64 %i1, 2
  %i4 = add i64 %i2, 2
  %i5 = add i64 %i3, 2
  %i6 = add i64 %i4, 2
  %i7 = add i64 %i5, 2
  %a0 = extractelement <8 x i64> %v0, i64 %i0
  %a1 = extractelement <8 x i64> %v1, i64 %i1
  %a2 = extractelement <8 x i64> %v0, i64 %i2
  %a3 = extractelement <8 x i64> %v1, i64 %i3
  %a4 = extractelement <8 x i64> %v0, i64 %i4
  %a5 = extractelement <8 x i64> %v1, i64 %i5
  %a6 = extractelement <8 x i64> %v0, i64 %i6
  %a7 = extractelement <8 x i64> %v1, i64 %i7
  %x0 = zext i64 %a0 to i128
  %x1 = zext i64 %a1 to i128
  %x2 = zext i64 %a2 to i128
  %x3 = zext i64 %a3 to i128
  %x4 = zext i64 %a4 to i128
  %x5 = zext i64 %a5 to i128
  %x6 = zext i64 %a6 to i128
  %x7 = zext i64 %a7 to i128
  %c0 = call i128 @llvm.clmul.i128(i128 %x0, i128 %x1)
  %c1 = call i128 @llvm.clmul.i128(i128 %x2, i128 %x3)
  %c2 = call i128 @llvm.clmul.i128(i128 %x4, i128 %x5)
  %c3 = call i128 @llvm.clmul.i128(i128 %x6, i128 %x7)
  %r0 = bitcast i128 %c0 to <2 x i64>
  %r1 = bitcast i128 %c1 to <2 x i64>
  %r2 = bitcast i128 %c2 to <2 x i64>
  %r3 = bitcast i128 %c3 to <2 x i64>
  %r01 = shufflevector <2 x i64> %r0, <2 x i64> %r1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %r23 = shufflevector <2 x i64> %r2, <2 x i64> %r3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %r = shufflevector <4 x i64> %r01, <4 x i64> %r23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i64> %r
}
| |
define <8 x i64> @pclmul512_hi_lo(<8 x i64> %v0, <8 x i64> %v1) {
; SSE-LABEL: pclmul512_hi_lo:
; SSE: # %bb.0:
; SSE-NEXT: pclmulqdq $1, %xmm4, %xmm0
; SSE-NEXT: pclmulqdq $1, %xmm5, %xmm1
; SSE-NEXT: pclmulqdq $1, %xmm6, %xmm2
; SSE-NEXT: pclmulqdq $1, %xmm7, %xmm3
; SSE-NEXT: retq
;
; AVX-PCLMUL-LABEL: pclmul512_hi_lo:
; AVX-PCLMUL: # %bb.0:
; AVX-PCLMUL-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX-PCLMUL-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX-PCLMUL-NEXT: vpclmulqdq $1, %xmm5, %xmm4, %xmm4
; AVX-PCLMUL-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX-PCLMUL-NEXT: vextractf128 $1, %ymm3, %xmm6
; AVX-PCLMUL-NEXT: vpclmulqdq $1, %xmm6, %xmm5, %xmm5
; AVX-PCLMUL-NEXT: vpclmulqdq $1, %xmm2, %xmm0, %xmm0
; AVX-PCLMUL-NEXT: vpclmulqdq $1, %xmm3, %xmm1, %xmm1
; AVX-PCLMUL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX-PCLMUL-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
; AVX-PCLMUL-NEXT: retq
;
; AVX2-VPCLMULQDQ-LABEL: pclmul512_hi_lo:
; AVX2-VPCLMULQDQ: # %bb.0:
; AVX2-VPCLMULQDQ-NEXT: vpclmulqdq $1, %ymm2, %ymm0, %ymm0
; AVX2-VPCLMULQDQ-NEXT: vpclmulqdq $1, %ymm3, %ymm1, %ymm1
; AVX2-VPCLMULQDQ-NEXT: retq
;
; AVX512-VPCLMULQDQ-LABEL: pclmul512_hi_lo:
; AVX512-VPCLMULQDQ: # %bb.0:
; AVX512-VPCLMULQDQ-NEXT: vpclmulqdq $1, %zmm1, %zmm0, %zmm0
; AVX512-VPCLMULQDQ-NEXT: retq
; 512-bit case, mirror of pclmul512_lo_hi: indices fold to 1,3,5,7 (v0 his)
; and 0,2,4,6 (v1 los) — imm8 = 0x01 in every 128-bit lane, widening to
; ymm/zmm ops as the feature set allows.
  %i0 = zext i1 1 to i64 ; constant time lo/hi select
  %i1 = zext i1 0 to i64 ; constant time lo/hi select
  %i2 = add i64 %i0, 2
  %i3 = add i64 %i1, 2
  %i4 = add i64 %i2, 2
  %i5 = add i64 %i3, 2
  %i6 = add i64 %i4, 2
  %i7 = add i64 %i5, 2
  %a0 = extractelement <8 x i64> %v0, i64 %i0
  %a1 = extractelement <8 x i64> %v1, i64 %i1
  %a2 = extractelement <8 x i64> %v0, i64 %i2
  %a3 = extractelement <8 x i64> %v1, i64 %i3
  %a4 = extractelement <8 x i64> %v0, i64 %i4
  %a5 = extractelement <8 x i64> %v1, i64 %i5
  %a6 = extractelement <8 x i64> %v0, i64 %i6
  %a7 = extractelement <8 x i64> %v1, i64 %i7
  %x0 = zext i64 %a0 to i128
  %x1 = zext i64 %a1 to i128
  %x2 = zext i64 %a2 to i128
  %x3 = zext i64 %a3 to i128
  %x4 = zext i64 %a4 to i128
  %x5 = zext i64 %a5 to i128
  %x6 = zext i64 %a6 to i128
  %x7 = zext i64 %a7 to i128
  %c0 = call i128 @llvm.clmul.i128(i128 %x0, i128 %x1)
  %c1 = call i128 @llvm.clmul.i128(i128 %x2, i128 %x3)
  %c2 = call i128 @llvm.clmul.i128(i128 %x4, i128 %x5)
  %c3 = call i128 @llvm.clmul.i128(i128 %x6, i128 %x7)
  %r0 = bitcast i128 %c0 to <2 x i64>
  %r1 = bitcast i128 %c1 to <2 x i64>
  %r2 = bitcast i128 %c2 to <2 x i64>
  %r3 = bitcast i128 %c3 to <2 x i64>
  %r01 = shufflevector <2 x i64> %r0, <2 x i64> %r1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %r23 = shufflevector <2 x i64> %r2, <2 x i64> %r3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %r = shufflevector <4 x i64> %r01, <4 x i64> %r23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i64> %r
}
| |
define <8 x i64> @pclmul512_lo_lo(<8 x i64> %a0, <8 x i64> %a1) {
; SSE-LABEL: pclmul512_lo_lo:
; SSE: # %bb.0:
; SSE-NEXT: pclmulqdq $0, %xmm4, %xmm0
; SSE-NEXT: pclmulqdq $0, %xmm5, %xmm1
; SSE-NEXT: pclmulqdq $0, %xmm6, %xmm2
; SSE-NEXT: pclmulqdq $0, %xmm7, %xmm3
; SSE-NEXT: retq
;
; AVX-PCLMUL-LABEL: pclmul512_lo_lo:
; AVX-PCLMUL: # %bb.0:
; AVX-PCLMUL-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm1[2,3]
; AVX-PCLMUL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-PCLMUL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2]
; AVX-PCLMUL-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm3[2,3]
; AVX-PCLMUL-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX-PCLMUL-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
; AVX-PCLMUL-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-PCLMUL-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX-PCLMUL-NEXT: vpclmulqdq $17, %xmm3, %xmm2, %xmm4
; AVX-PCLMUL-NEXT: vpclmulqdq $0, %xmm3, %xmm2, %xmm2
; AVX-PCLMUL-NEXT: vpclmulqdq $17, %xmm1, %xmm0, %xmm3
; AVX-PCLMUL-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0
; AVX-PCLMUL-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-PCLMUL-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm1
; AVX-PCLMUL-NEXT: retq
;
; AVX2-VPCLMULQDQ-LABEL: pclmul512_lo_lo:
; AVX2-VPCLMULQDQ: # %bb.0:
; AVX2-VPCLMULQDQ-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm1[2,3]
; AVX2-VPCLMULQDQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-VPCLMULQDQ-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2]
; AVX2-VPCLMULQDQ-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm3[2,3]
; AVX2-VPCLMULQDQ-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-VPCLMULQDQ-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
; AVX2-VPCLMULQDQ-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX2-VPCLMULQDQ-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX2-VPCLMULQDQ-NEXT: vpclmulqdq $17, %xmm3, %xmm2, %xmm4
; AVX2-VPCLMULQDQ-NEXT: vpclmulqdq $0, %xmm3, %xmm2, %xmm2
; AVX2-VPCLMULQDQ-NEXT: vpclmulqdq $17, %xmm1, %xmm0, %xmm3
; AVX2-VPCLMULQDQ-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0
; AVX2-VPCLMULQDQ-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX2-VPCLMULQDQ-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm1
; AVX2-VPCLMULQDQ-NEXT: retq
;
; AVX512-VPCLMULQDQ-LABEL: pclmul512_lo_lo:
; AVX512-VPCLMULQDQ: # %bb.0:
; AVX512-VPCLMULQDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512-VPCLMULQDQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
; AVX512-VPCLMULQDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512-VPCLMULQDQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512-VPCLMULQDQ-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
; AVX512-VPCLMULQDQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
; AVX512-VPCLMULQDQ-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512-VPCLMULQDQ-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX512-VPCLMULQDQ-NEXT: vpclmulqdq $17, %xmm1, %xmm0, %xmm4
; AVX512-VPCLMULQDQ-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0
; AVX512-VPCLMULQDQ-NEXT: vpclmulqdq $17, %xmm3, %xmm2, %xmm1
; AVX512-VPCLMULQDQ-NEXT: vpclmulqdq $0, %xmm3, %xmm2, %xmm2
; AVX512-VPCLMULQDQ-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX512-VPCLMULQDQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
; AVX512-VPCLMULQDQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-VPCLMULQDQ-NEXT: retq
; Vector (v4i128) form: the <0,2,4,6> shuffles pick the lo qword of every
; 128-bit quarter of both inputs, i.e. imm8 = 0x00 per lane. SSE matches
; this to four pclmulqdq $0 ops, but the checks above show AVX/AVX512
; targets still materialize the shuffles and widening clmuls explicitly —
; a missed fold this test documents (see the PR reference in the header).
  %s0 = shufflevector <8 x i64> %a0, <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %s1 = shufflevector <8 x i64> %a1, <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %x0 = zext <4 x i64> %s0 to <4 x i128>
  %x1 = zext <4 x i64> %s1 to <4 x i128>
  %clmul = call <4 x i128> @llvm.clmul.v4i128(<4 x i128> %x0, <4 x i128> %x1)
  %res = bitcast <4 x i128> %clmul to <8 x i64>
  ret <8 x i64>%res
}