; blob: 92df2d9987dbe734d663b12a5ec9b94c3f626b00 (source-viewer artifact, commented out so the file parses)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+pclmul | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx,+pclmul | FileCheck %s --check-prefixes=AVX,AVX-PCLMUL
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx,+vpclmulqdq | FileCheck %s --check-prefixes=AVX,AVX-VPCLMULQDQ,AVX2-VPCLMULQDQ
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+vpclmulqdq | FileCheck %s --check-prefixes=AVX,AVX-VPCLMULQDQ,AVX512-VPCLMULQDQ
; PR176879 - Match PCLMULQDQ codegen with llvm.clmul intrinsic implementations
define <2 x i64> @pclmul128_lo_hi(<2 x i64> %v0, <2 x i64> %v1) {
; SSE-LABEL: pclmul128_lo_hi:
; SSE: # %bb.0:
; SSE-NEXT: pclmulqdq $16, %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: pclmul128_lo_hi:
; AVX: # %bb.0:
; AVX-NEXT: vpclmulqdq $16, %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
; Scalar pattern: extract qword 0 of %v0 and qword 1 of %v1, zext each to i128,
; clmul them, and bitcast back to <2 x i64>.  Per the checks this folds to one
; PCLMULQDQ with imm 0x10 (imm[0]=0 selects the low qword of the first source,
; imm[4]=1 selects the high qword of the second).
%i0 = zext i1 0 to i64 ; constant time lo/hi select
%i1 = zext i1 1 to i64 ; constant time lo/hi select
%a0 = extractelement <2 x i64> %v0, i64 %i0
%a1 = extractelement <2 x i64> %v1, i64 %i1
%x0 = zext i64 %a0 to i128
%x1 = zext i64 %a1 to i128
%cl = call i128 @llvm.clmul.i128(i128 %x0, i128 %x1)
%r = bitcast i128 %cl to <2 x i64>
ret <2 x i64> %r
}
define <2 x i64> @pclmul128_hi_hi(<2 x i64> %v0, <2 x i64> %v1) {
; SSE-LABEL: pclmul128_hi_hi:
; SSE: # %bb.0:
; SSE-NEXT: pclmulqdq $17, %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: pclmul128_hi_hi:
; AVX: # %bb.0:
; AVX-NEXT: vpclmulqdq $17, %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
; Same scalar pattern as pclmul128_lo_hi but selecting qword 1 from both
; vectors; per the checks this folds to PCLMULQDQ imm 0x11 (high x high).
%i0 = zext i1 1 to i64 ; constant time lo/hi select
%i1 = zext i1 1 to i64 ; constant time lo/hi select
%a0 = extractelement <2 x i64> %v0, i64 %i0
%a1 = extractelement <2 x i64> %v1, i64 %i1
%x0 = zext i64 %a0 to i128
%x1 = zext i64 %a1 to i128
%cl = call i128 @llvm.clmul.i128(i128 %x0, i128 %x1)
%r = bitcast i128 %cl to <2 x i64>
ret <2 x i64> %r
}
define <2 x i64> @pclmul128_hi_lo_vector(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: pclmul128_hi_lo_vector:
; SSE: # %bb.0:
; SSE-NEXT: pclmulqdq $1, %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: pclmul128_hi_lo_vector:
; AVX: # %bb.0:
; AVX-NEXT: vpclmulqdq $1, %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
; Vector form of the same match: the qword selection is expressed as a
; <1 x i64> shufflevector (lane 1 of %a0, lane 0 of %a1) feeding the v1i128
; clmul intrinsic.  Per the checks this folds to PCLMULQDQ imm 0x01
; (high qword of first source x low qword of second).
%s0 = shufflevector <2 x i64> %a0, <2 x i64> poison, <1 x i32> <i32 1>
%s1 = shufflevector <2 x i64> %a1, <2 x i64> poison, <1 x i32> <i32 0>
%x0 = zext <1 x i64> %s0 to <1 x i128>
%x1 = zext <1 x i64> %s1 to <1 x i128>
%clmul = call <1 x i128> @llvm.clmul.v1i128(<1 x i128> %x0, <1 x i128> %x1)
%res = bitcast <1 x i128> %clmul to <2 x i64>
ret <2 x i64>%res
}
define <4 x i64> @pclmul256_lo_lo(<4 x i64> %v0, <4 x i64> %v1) {
; SSE-LABEL: pclmul256_lo_lo:
; SSE: # %bb.0:
; SSE-NEXT: pclmulqdq $0, %xmm2, %xmm0
; SSE-NEXT: pclmulqdq $0, %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-PCLMUL-LABEL: pclmul256_lo_lo:
; AVX-PCLMUL: # %bb.0:
; AVX-PCLMUL-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-PCLMUL-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX-PCLMUL-NEXT: vpclmulqdq $0, %xmm3, %xmm2, %xmm2
; AVX-PCLMUL-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0
; AVX-PCLMUL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-PCLMUL-NEXT: retq
;
; AVX-VPCLMULQDQ-LABEL: pclmul256_lo_lo:
; AVX-VPCLMULQDQ: # %bb.0:
; AVX-VPCLMULQDQ-NEXT: vpclmulqdq $0, %ymm1, %ymm0, %ymm0
; AVX-VPCLMULQDQ-NEXT: retq
; 256-bit scalar pattern: clmul(qword 0 x qword 0) per 128-bit lane (indices
; 0 and 2), results concatenated via shufflevector.  Per the checks this folds
; to two 128-bit PCLMULQDQ imm 0 ops (SSE / AVX+pclmul, the latter splitting
; the ymm inputs), or a single 256-bit vpclmulqdq $0 with +vpclmulqdq.
%i0 = zext i1 0 to i64 ; constant time lo/hi select
%i1 = zext i1 0 to i64 ; constant time lo/hi select
%i2 = add i64 %i0, 2
%i3 = add i64 %i1, 2
%a0 = extractelement <4 x i64> %v0, i64 %i0
%a1 = extractelement <4 x i64> %v1, i64 %i1
%a2 = extractelement <4 x i64> %v0, i64 %i2
%a3 = extractelement <4 x i64> %v1, i64 %i3
%x0 = zext i64 %a0 to i128
%x1 = zext i64 %a1 to i128
%x2 = zext i64 %a2 to i128
%x3 = zext i64 %a3 to i128
%c0 = call i128 @llvm.clmul.i128(i128 %x0, i128 %x1)
%c1 = call i128 @llvm.clmul.i128(i128 %x2, i128 %x3)
%r0 = bitcast i128 %c0 to <2 x i64>
%r1 = bitcast i128 %c1 to <2 x i64>
%r = shufflevector <2 x i64> %r0, <2 x i64> %r1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i64> %r
}
define <4 x i64> @pclmul256_lo_hi(<4 x i64> %v0, <4 x i64> %v1) {
; SSE-LABEL: pclmul256_lo_hi:
; SSE: # %bb.0:
; SSE-NEXT: pclmulqdq $16, %xmm2, %xmm0
; SSE-NEXT: pclmulqdq $16, %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-PCLMUL-LABEL: pclmul256_lo_hi:
; AVX-PCLMUL: # %bb.0:
; AVX-PCLMUL-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-PCLMUL-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX-PCLMUL-NEXT: vpclmulqdq $16, %xmm3, %xmm2, %xmm2
; AVX-PCLMUL-NEXT: vpclmulqdq $16, %xmm1, %xmm0, %xmm0
; AVX-PCLMUL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-PCLMUL-NEXT: retq
;
; AVX-VPCLMULQDQ-LABEL: pclmul256_lo_hi:
; AVX-VPCLMULQDQ: # %bb.0:
; AVX-VPCLMULQDQ-NEXT: vpclmulqdq $16, %ymm1, %ymm0, %ymm0
; AVX-VPCLMULQDQ-NEXT: retq
; Same 256-bit pattern as pclmul256_lo_lo but pairing qword 0 of each %v0 lane
; with qword 1 of each %v1 lane (indices 0/2 vs 1/3); per the checks this
; folds to PCLMULQDQ imm 0x10 per lane, or one 256-bit vpclmulqdq $16.
%i0 = zext i1 0 to i64 ; constant time lo/hi select
%i1 = zext i1 1 to i64 ; constant time lo/hi select
%i2 = add i64 %i0, 2
%i3 = add i64 %i1, 2
%a0 = extractelement <4 x i64> %v0, i64 %i0
%a1 = extractelement <4 x i64> %v1, i64 %i1
%a2 = extractelement <4 x i64> %v0, i64 %i2
%a3 = extractelement <4 x i64> %v1, i64 %i3
%x0 = zext i64 %a0 to i128
%x1 = zext i64 %a1 to i128
%x2 = zext i64 %a2 to i128
%x3 = zext i64 %a3 to i128
%c0 = call i128 @llvm.clmul.i128(i128 %x0, i128 %x1)
%c1 = call i128 @llvm.clmul.i128(i128 %x2, i128 %x3)
%r0 = bitcast i128 %c0 to <2 x i64>
%r1 = bitcast i128 %c1 to <2 x i64>
%r = shufflevector <2 x i64> %r0, <2 x i64> %r1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i64> %r
}
define <4 x i64> @pclmul256_hi_hi_vector(<4 x i64> %a0, <4 x i64> %a1) {
; SSE-LABEL: pclmul256_hi_hi_vector:
; SSE: # %bb.0:
; SSE-NEXT: pclmulqdq $17, %xmm2, %xmm0
; SSE-NEXT: pclmulqdq $17, %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-PCLMUL-LABEL: pclmul256_hi_hi_vector:
; AVX-PCLMUL: # %bb.0:
; AVX-PCLMUL-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-PCLMUL-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX-PCLMUL-NEXT: vpclmulqdq $17, %xmm3, %xmm2, %xmm2
; AVX-PCLMUL-NEXT: vpclmulqdq $17, %xmm1, %xmm0, %xmm0
; AVX-PCLMUL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-PCLMUL-NEXT: retq
;
; AVX-VPCLMULQDQ-LABEL: pclmul256_hi_hi_vector:
; AVX-VPCLMULQDQ: # %bb.0:
; AVX-VPCLMULQDQ-NEXT: vpclmulqdq $17, %ymm1, %ymm0, %ymm0
; AVX-VPCLMULQDQ-NEXT: retq
; Vector form: shufflevector gathers the odd (high) qwords <1,3> of each input
; into <2 x i64>, zexts to <2 x i128>, and calls the v2i128 clmul.  Per the
; checks this folds to PCLMULQDQ imm 0x11 per 128-bit lane, or one 256-bit
; vpclmulqdq $17 with +vpclmulqdq.
%s0 = shufflevector <4 x i64> %a0, <4 x i64> poison, <2 x i32> <i32 1, i32 3>
%s1 = shufflevector <4 x i64> %a1, <4 x i64> poison, <2 x i32> <i32 1, i32 3>
%x0 = zext <2 x i64> %s0 to <2 x i128>
%x1 = zext <2 x i64> %s1 to <2 x i128>
%clmul = call <2 x i128> @llvm.clmul.v2i128(<2 x i128> %x0, <2 x i128> %x1)
%res = bitcast <2 x i128> %clmul to <4 x i64>
ret <4 x i64>%res
}
define <8 x i64> @pclmul512_lo_hi(<8 x i64> %v0, <8 x i64> %v1) {
; SSE-LABEL: pclmul512_lo_hi:
; SSE: # %bb.0:
; SSE-NEXT: pclmulqdq $16, %xmm4, %xmm0
; SSE-NEXT: pclmulqdq $16, %xmm5, %xmm1
; SSE-NEXT: pclmulqdq $16, %xmm6, %xmm2
; SSE-NEXT: pclmulqdq $16, %xmm7, %xmm3
; SSE-NEXT: retq
;
; AVX-PCLMUL-LABEL: pclmul512_lo_hi:
; AVX-PCLMUL: # %bb.0:
; AVX-PCLMUL-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX-PCLMUL-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX-PCLMUL-NEXT: vpclmulqdq $16, %xmm5, %xmm4, %xmm4
; AVX-PCLMUL-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX-PCLMUL-NEXT: vextractf128 $1, %ymm3, %xmm6
; AVX-PCLMUL-NEXT: vpclmulqdq $16, %xmm6, %xmm5, %xmm5
; AVX-PCLMUL-NEXT: vpclmulqdq $16, %xmm2, %xmm0, %xmm0
; AVX-PCLMUL-NEXT: vpclmulqdq $16, %xmm3, %xmm1, %xmm1
; AVX-PCLMUL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX-PCLMUL-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
; AVX-PCLMUL-NEXT: retq
;
; AVX2-VPCLMULQDQ-LABEL: pclmul512_lo_hi:
; AVX2-VPCLMULQDQ: # %bb.0:
; AVX2-VPCLMULQDQ-NEXT: vpclmulqdq $16, %ymm2, %ymm0, %ymm0
; AVX2-VPCLMULQDQ-NEXT: vpclmulqdq $16, %ymm3, %ymm1, %ymm1
; AVX2-VPCLMULQDQ-NEXT: retq
;
; AVX512-VPCLMULQDQ-LABEL: pclmul512_lo_hi:
; AVX512-VPCLMULQDQ: # %bb.0:
; AVX512-VPCLMULQDQ-NEXT: vpclmulqdq $16, %zmm1, %zmm0, %zmm0
; AVX512-VPCLMULQDQ-NEXT: retq
; 512-bit scalar pattern: per 128-bit lane, clmul(qword 0 of %v0 lane,
; qword 1 of %v1 lane) — %v0 indices 0/2/4/6, %v1 indices 1/3/5/7 built by
; chained adds.  Per the checks this folds to four xmm PCLMULQDQ imm 0x10
; ops, two ymm ops with AVX+vpclmulqdq, or one zmm op with AVX512.
%i0 = zext i1 0 to i64 ; constant time lo/hi select
%i1 = zext i1 1 to i64 ; constant time lo/hi select
%i2 = add i64 %i0, 2
%i3 = add i64 %i1, 2
%i4 = add i64 %i2, 2
%i5 = add i64 %i3, 2
%i6 = add i64 %i4, 2
%i7 = add i64 %i5, 2
%a0 = extractelement <8 x i64> %v0, i64 %i0
%a1 = extractelement <8 x i64> %v1, i64 %i1
%a2 = extractelement <8 x i64> %v0, i64 %i2
%a3 = extractelement <8 x i64> %v1, i64 %i3
%a4 = extractelement <8 x i64> %v0, i64 %i4
%a5 = extractelement <8 x i64> %v1, i64 %i5
%a6 = extractelement <8 x i64> %v0, i64 %i6
%a7 = extractelement <8 x i64> %v1, i64 %i7
%x0 = zext i64 %a0 to i128
%x1 = zext i64 %a1 to i128
%x2 = zext i64 %a2 to i128
%x3 = zext i64 %a3 to i128
%x4 = zext i64 %a4 to i128
%x5 = zext i64 %a5 to i128
%x6 = zext i64 %a6 to i128
%x7 = zext i64 %a7 to i128
%c0 = call i128 @llvm.clmul.i128(i128 %x0, i128 %x1)
%c1 = call i128 @llvm.clmul.i128(i128 %x2, i128 %x3)
%c2 = call i128 @llvm.clmul.i128(i128 %x4, i128 %x5)
%c3 = call i128 @llvm.clmul.i128(i128 %x6, i128 %x7)
%r0 = bitcast i128 %c0 to <2 x i64>
%r1 = bitcast i128 %c1 to <2 x i64>
%r2 = bitcast i128 %c2 to <2 x i64>
%r3 = bitcast i128 %c3 to <2 x i64>
%r01 = shufflevector <2 x i64> %r0, <2 x i64> %r1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%r23 = shufflevector <2 x i64> %r2, <2 x i64> %r3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%r = shufflevector <4 x i64> %r01, <4 x i64> %r23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %r
}
define <8 x i64> @pclmul512_hi_lo(<8 x i64> %v0, <8 x i64> %v1) {
; SSE-LABEL: pclmul512_hi_lo:
; SSE: # %bb.0:
; SSE-NEXT: pclmulqdq $1, %xmm4, %xmm0
; SSE-NEXT: pclmulqdq $1, %xmm5, %xmm1
; SSE-NEXT: pclmulqdq $1, %xmm6, %xmm2
; SSE-NEXT: pclmulqdq $1, %xmm7, %xmm3
; SSE-NEXT: retq
;
; AVX-PCLMUL-LABEL: pclmul512_hi_lo:
; AVX-PCLMUL: # %bb.0:
; AVX-PCLMUL-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX-PCLMUL-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX-PCLMUL-NEXT: vpclmulqdq $1, %xmm5, %xmm4, %xmm4
; AVX-PCLMUL-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX-PCLMUL-NEXT: vextractf128 $1, %ymm3, %xmm6
; AVX-PCLMUL-NEXT: vpclmulqdq $1, %xmm6, %xmm5, %xmm5
; AVX-PCLMUL-NEXT: vpclmulqdq $1, %xmm2, %xmm0, %xmm0
; AVX-PCLMUL-NEXT: vpclmulqdq $1, %xmm3, %xmm1, %xmm1
; AVX-PCLMUL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX-PCLMUL-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
; AVX-PCLMUL-NEXT: retq
;
; AVX2-VPCLMULQDQ-LABEL: pclmul512_hi_lo:
; AVX2-VPCLMULQDQ: # %bb.0:
; AVX2-VPCLMULQDQ-NEXT: vpclmulqdq $1, %ymm2, %ymm0, %ymm0
; AVX2-VPCLMULQDQ-NEXT: vpclmulqdq $1, %ymm3, %ymm1, %ymm1
; AVX2-VPCLMULQDQ-NEXT: retq
;
; AVX512-VPCLMULQDQ-LABEL: pclmul512_hi_lo:
; AVX512-VPCLMULQDQ: # %bb.0:
; AVX512-VPCLMULQDQ-NEXT: vpclmulqdq $1, %zmm1, %zmm0, %zmm0
; AVX512-VPCLMULQDQ-NEXT: retq
; Mirror of pclmul512_lo_hi with the selects swapped: %v0 indices 1/3/5/7
; (high qwords), %v1 indices 0/2/4/6 (low qwords).  Per the checks this folds
; to PCLMULQDQ imm 0x01 at xmm/ymm/zmm width depending on features.
%i0 = zext i1 1 to i64 ; constant time lo/hi select
%i1 = zext i1 0 to i64 ; constant time lo/hi select
%i2 = add i64 %i0, 2
%i3 = add i64 %i1, 2
%i4 = add i64 %i2, 2
%i5 = add i64 %i3, 2
%i6 = add i64 %i4, 2
%i7 = add i64 %i5, 2
%a0 = extractelement <8 x i64> %v0, i64 %i0
%a1 = extractelement <8 x i64> %v1, i64 %i1
%a2 = extractelement <8 x i64> %v0, i64 %i2
%a3 = extractelement <8 x i64> %v1, i64 %i3
%a4 = extractelement <8 x i64> %v0, i64 %i4
%a5 = extractelement <8 x i64> %v1, i64 %i5
%a6 = extractelement <8 x i64> %v0, i64 %i6
%a7 = extractelement <8 x i64> %v1, i64 %i7
%x0 = zext i64 %a0 to i128
%x1 = zext i64 %a1 to i128
%x2 = zext i64 %a2 to i128
%x3 = zext i64 %a3 to i128
%x4 = zext i64 %a4 to i128
%x5 = zext i64 %a5 to i128
%x6 = zext i64 %a6 to i128
%x7 = zext i64 %a7 to i128
%c0 = call i128 @llvm.clmul.i128(i128 %x0, i128 %x1)
%c1 = call i128 @llvm.clmul.i128(i128 %x2, i128 %x3)
%c2 = call i128 @llvm.clmul.i128(i128 %x4, i128 %x5)
%c3 = call i128 @llvm.clmul.i128(i128 %x6, i128 %x7)
%r0 = bitcast i128 %c0 to <2 x i64>
%r1 = bitcast i128 %c1 to <2 x i64>
%r2 = bitcast i128 %c2 to <2 x i64>
%r3 = bitcast i128 %c3 to <2 x i64>
%r01 = shufflevector <2 x i64> %r0, <2 x i64> %r1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%r23 = shufflevector <2 x i64> %r2, <2 x i64> %r3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%r = shufflevector <4 x i64> %r01, <4 x i64> %r23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %r
}
define <8 x i64> @pclmul512_lo_lo(<8 x i64> %a0, <8 x i64> %a1) {
; SSE-LABEL: pclmul512_lo_lo:
; SSE: # %bb.0:
; SSE-NEXT: pclmulqdq $0, %xmm4, %xmm0
; SSE-NEXT: pclmulqdq $0, %xmm5, %xmm1
; SSE-NEXT: pclmulqdq $0, %xmm6, %xmm2
; SSE-NEXT: pclmulqdq $0, %xmm7, %xmm3
; SSE-NEXT: retq
;
; AVX-PCLMUL-LABEL: pclmul512_lo_lo:
; AVX-PCLMUL: # %bb.0:
; AVX-PCLMUL-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm1[2,3]
; AVX-PCLMUL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-PCLMUL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2]
; AVX-PCLMUL-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm3[2,3]
; AVX-PCLMUL-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX-PCLMUL-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
; AVX-PCLMUL-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-PCLMUL-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX-PCLMUL-NEXT: vpclmulqdq $17, %xmm3, %xmm2, %xmm4
; AVX-PCLMUL-NEXT: vpclmulqdq $0, %xmm3, %xmm2, %xmm2
; AVX-PCLMUL-NEXT: vpclmulqdq $17, %xmm1, %xmm0, %xmm3
; AVX-PCLMUL-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0
; AVX-PCLMUL-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-PCLMUL-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm1
; AVX-PCLMUL-NEXT: retq
;
; AVX2-VPCLMULQDQ-LABEL: pclmul512_lo_lo:
; AVX2-VPCLMULQDQ: # %bb.0:
; AVX2-VPCLMULQDQ-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm1[2,3]
; AVX2-VPCLMULQDQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-VPCLMULQDQ-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2]
; AVX2-VPCLMULQDQ-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm3[2,3]
; AVX2-VPCLMULQDQ-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-VPCLMULQDQ-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
; AVX2-VPCLMULQDQ-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX2-VPCLMULQDQ-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX2-VPCLMULQDQ-NEXT: vpclmulqdq $17, %xmm3, %xmm2, %xmm4
; AVX2-VPCLMULQDQ-NEXT: vpclmulqdq $0, %xmm3, %xmm2, %xmm2
; AVX2-VPCLMULQDQ-NEXT: vpclmulqdq $17, %xmm1, %xmm0, %xmm3
; AVX2-VPCLMULQDQ-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0
; AVX2-VPCLMULQDQ-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX2-VPCLMULQDQ-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm1
; AVX2-VPCLMULQDQ-NEXT: retq
;
; AVX512-VPCLMULQDQ-LABEL: pclmul512_lo_lo:
; AVX512-VPCLMULQDQ: # %bb.0:
; AVX512-VPCLMULQDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512-VPCLMULQDQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
; AVX512-VPCLMULQDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512-VPCLMULQDQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512-VPCLMULQDQ-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
; AVX512-VPCLMULQDQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
; AVX512-VPCLMULQDQ-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512-VPCLMULQDQ-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX512-VPCLMULQDQ-NEXT: vpclmulqdq $17, %xmm1, %xmm0, %xmm4
; AVX512-VPCLMULQDQ-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0
; AVX512-VPCLMULQDQ-NEXT: vpclmulqdq $17, %xmm3, %xmm2, %xmm1
; AVX512-VPCLMULQDQ-NEXT: vpclmulqdq $0, %xmm3, %xmm2, %xmm2
; AVX512-VPCLMULQDQ-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX512-VPCLMULQDQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
; AVX512-VPCLMULQDQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-VPCLMULQDQ-NEXT: retq
; Vector form: shufflevector gathers the even (low) qwords <0,2,4,6> of each
; input into <4 x i64>, zexts to <4 x i128>, and calls the v4i128 clmul.
; Only the SSE run folds this to four direct PCLMULQDQ imm 0 ops.
; NOTE(review): the recorded AVX/AVX512 output instead pre-shuffles the even
; qwords together and emits $17/$0 pclmul pairs — looks like the direct
; lo_lo fold is currently missed for this shuffle-based form (see PR note at
; the top of the file); the checks record current codegen, not the ideal.
%s0 = shufflevector <8 x i64> %a0, <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%s1 = shufflevector <8 x i64> %a1, <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%x0 = zext <4 x i64> %s0 to <4 x i128>
%x1 = zext <4 x i64> %s1 to <4 x i128>
%clmul = call <4 x i128> @llvm.clmul.v4i128(<4 x i128> %x0, <4 x i128> %x1)
%res = bitcast <4 x i128> %clmul to <8 x i64>
ret <8 x i64>%res
}