; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avxifma | FileCheck %s --check-prefixes=X64,AVX,AVXIFMA
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512ifma | FileCheck %s --check-prefixes=X64,AVX512,AVX512-NOVL
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512ifma,+avx512vl | FileCheck %s --check-prefixes=X64,AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avxifma,+avx512vl | FileCheck %s --check-prefixes=X64,AVX,AVX512-NOIFMA
; 67108863 == (1 << 26) - 1
; 4503599627370496 == (1 << 52)
; 4503599627370495 == (1 << 52) - 1
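; VPMADD52LUQ adds the low 52 bits of the 52x52-bit product to the
; accumulator, so (x*y)+z can only be combined into it when the known bit
; widths of x and y sum to at most 52 bits.
; 26-bit * 26-bit: the product fits in 52 bits, so the combine applies.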
define <8 x i64> @test_512_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
; AVXIFMA-LABEL: test_512_combine:
; AVXIFMA: # %bb.0:
; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm6 = [67108863,67108863,67108863,67108863]
; AVXIFMA-NEXT: vpand %ymm6, %ymm2, %ymm2
; AVXIFMA-NEXT: vpand %ymm6, %ymm0, %ymm0
; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm2, %ymm0, %ymm4
; AVXIFMA-NEXT: vpand %ymm6, %ymm3, %ymm0
; AVXIFMA-NEXT: vpand %ymm6, %ymm1, %ymm1
; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm0, %ymm1, %ymm5
; AVXIFMA-NEXT: vmovdqa %ymm4, %ymm0
; AVXIFMA-NEXT: vmovdqa %ymm5, %ymm1
; AVXIFMA-NEXT: retq
;
; AVX512-LABEL: test_512_combine:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm3 = [67108863,67108863,67108863,67108863,67108863,67108863,67108863,67108863]
; AVX512-NEXT: vpandq %zmm3, %zmm0, %zmm0
; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1
; AVX512-NEXT: vpmadd52luq %zmm1, %zmm0, %zmm2
; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512-NEXT: retq
;
; AVX512-NOIFMA-LABEL: test_512_combine:
; AVX512-NOIFMA: # %bb.0:
; AVX512-NOIFMA-NEXT: vpbroadcastq {{.*#+}} zmm3 = [67108863,67108863,67108863,67108863,67108863,67108863,67108863,67108863]
; AVX512-NOIFMA-NEXT: vpandq %zmm3, %zmm0, %zmm0
; AVX512-NOIFMA-NEXT: vpandq %zmm3, %zmm1, %zmm1
; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm0, %ymm4
; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm2, %ymm5
; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm3, %ymm4, %ymm5
; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm1, %ymm0, %ymm2
; AVX512-NOIFMA-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm0
; AVX512-NOIFMA-NEXT: retq
%x_masked = and <8 x i64> %x, splat (i64 67108863)
%y_masked = and <8 x i64> %y, splat (i64 67108863)
%mul = mul nuw nsw <8 x i64> %x_masked, %y_masked
%res = add nuw nsw <8 x i64> %mul, %z
ret <8 x i64> %res
}
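; Asymmetric masks: a 50-bit * 2-bit product still fits in 52 bits, so the
; combine applies.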
define <8 x i64> @test_512_combine_v2(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
; AVXIFMA-LABEL: test_512_combine_v2:
; AVXIFMA: # %bb.0:
; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm6 = [3,3,3,3]
; AVXIFMA-NEXT: vpand %ymm6, %ymm2, %ymm2
; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm7 = [1125899906842623,1125899906842623,1125899906842623,1125899906842623]
; AVXIFMA-NEXT: vpand %ymm7, %ymm0, %ymm0
; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm2, %ymm0, %ymm4
; AVXIFMA-NEXT: vpand %ymm6, %ymm3, %ymm0
; AVXIFMA-NEXT: vpand %ymm7, %ymm1, %ymm1
; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm0, %ymm1, %ymm5
; AVXIFMA-NEXT: vmovdqa %ymm4, %ymm0
; AVXIFMA-NEXT: vmovdqa %ymm5, %ymm1
; AVXIFMA-NEXT: retq
;
; AVX512-LABEL: test_512_combine_v2:
; AVX512: # %bb.0:
; AVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; AVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
; AVX512-NEXT: vpmadd52luq %zmm1, %zmm0, %zmm2
; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512-NEXT: retq
;
; AVX512-NOIFMA-LABEL: test_512_combine_v2:
; AVX512-NOIFMA: # %bb.0:
; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm1, %ymm4
; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm0, %ymm5
; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm4, %ymm5, %ymm3
; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm1, %ymm0, %ymm2
; AVX512-NOIFMA-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0
; AVX512-NOIFMA-NEXT: retq
%x_masked = and <8 x i64> %x, splat (i64 1125899906842623) ; (1 << 50) - 1
%y_masked = and <8 x i64> %y, splat (i64 3)
%mul = mul nuw nsw <8 x i64> %x_masked, %y_masked
%res = add nuw nsw <8 x i64> %mul, %z
ret <8 x i64> %res
}
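; Both operands may use a full 52 bits, so the product can need up to 104
; bits and the combine must not fire.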
define <8 x i64> @test_512_no_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
; AVXIFMA-LABEL: test_512_no_combine:
; AVXIFMA: # %bb.0:
; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm6 = [4503599627370495,4503599627370495,4503599627370495,4503599627370495]
; AVXIFMA-NEXT: vpand %ymm6, %ymm0, %ymm7
; AVXIFMA-NEXT: vpand %ymm6, %ymm1, %ymm8
; AVXIFMA-NEXT: vpand %ymm6, %ymm2, %ymm9
; AVXIFMA-NEXT: vpand %ymm6, %ymm3, %ymm6
; AVXIFMA-NEXT: vpsrlq $32, %ymm8, %ymm8
; AVXIFMA-NEXT: vpmuludq %ymm3, %ymm8, %ymm8
; AVXIFMA-NEXT: vpsrlq $32, %ymm6, %ymm6
; AVXIFMA-NEXT: vpmuludq %ymm6, %ymm1, %ymm6
; AVXIFMA-NEXT: vpaddq %ymm6, %ymm8, %ymm6
; AVXIFMA-NEXT: vpsllq $32, %ymm6, %ymm6
; AVXIFMA-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
; AVXIFMA-NEXT: vpsrlq $32, %ymm7, %ymm3
; AVXIFMA-NEXT: vpmuludq %ymm2, %ymm3, %ymm3
; AVXIFMA-NEXT: vpsrlq $32, %ymm9, %ymm7
; AVXIFMA-NEXT: vpmuludq %ymm7, %ymm0, %ymm7
; AVXIFMA-NEXT: vpaddq %ymm3, %ymm7, %ymm3
; AVXIFMA-NEXT: vpsllq $32, %ymm3, %ymm3
; AVXIFMA-NEXT: vpmuludq %ymm2, %ymm0, %ymm0
; AVXIFMA-NEXT: vpaddq %ymm4, %ymm0, %ymm0
; AVXIFMA-NEXT: vpaddq %ymm3, %ymm0, %ymm0
; AVXIFMA-NEXT: vpaddq %ymm5, %ymm1, %ymm1
; AVXIFMA-NEXT: vpaddq %ymm6, %ymm1, %ymm1
; AVXIFMA-NEXT: retq
;
; AVX512-LABEL: test_512_no_combine:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm3 = [4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495]
; AVX512-NEXT: vpandq %zmm3, %zmm0, %zmm4
; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm3
; AVX512-NEXT: vpsrlq $32, %zmm4, %zmm4
; AVX512-NEXT: vpmuludq %zmm1, %zmm4, %zmm4
; AVX512-NEXT: vpsrlq $32, %zmm3, %zmm3
; AVX512-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
; AVX512-NEXT: vpaddq %zmm4, %zmm3, %zmm3
; AVX512-NEXT: vpsllq $32, %zmm3, %zmm3
; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpaddq %zmm3, %zmm0, %zmm0
; AVX512-NEXT: retq
;
; AVX512-NOIFMA-LABEL: test_512_no_combine:
; AVX512-NOIFMA: # %bb.0:
; AVX512-NOIFMA-NEXT: vpbroadcastq {{.*#+}} zmm3 = [4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495]
; AVX512-NOIFMA-NEXT: vpandq %zmm3, %zmm0, %zmm4
; AVX512-NOIFMA-NEXT: vpandq %zmm3, %zmm1, %zmm3
; AVX512-NOIFMA-NEXT: vpsrlq $32, %zmm4, %zmm4
; AVX512-NOIFMA-NEXT: vpmuludq %zmm1, %zmm4, %zmm4
; AVX512-NOIFMA-NEXT: vpsrlq $32, %zmm3, %zmm3
; AVX512-NOIFMA-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
; AVX512-NOIFMA-NEXT: vpaddq %zmm4, %zmm3, %zmm3
; AVX512-NOIFMA-NEXT: vpsllq $32, %zmm3, %zmm3
; AVX512-NOIFMA-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512-NOIFMA-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512-NOIFMA-NEXT: vpaddq %zmm3, %zmm0, %zmm0
; AVX512-NOIFMA-NEXT: retq
%x_masked = and <8 x i64> %x, splat (i64 4503599627370495)
%y_masked = and <8 x i64> %y, splat (i64 4503599627370495)
%mul = mul nuw nsw <8 x i64> %x_masked, %y_masked
%res = add nuw nsw <8 x i64> %mul, %z
ret <8 x i64> %res
}
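; Unmasked operands: nothing is known about the product width, so no combine.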
define <8 x i64> @test_512_no_combine_v2(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
; AVXIFMA-LABEL: test_512_no_combine_v2:
; AVXIFMA: # %bb.0:
; AVXIFMA-NEXT: vpsrlq $32, %ymm1, %ymm6
; AVXIFMA-NEXT: vpmuludq %ymm3, %ymm6, %ymm6
; AVXIFMA-NEXT: vpsrlq $32, %ymm3, %ymm7
; AVXIFMA-NEXT: vpmuludq %ymm7, %ymm1, %ymm7
; AVXIFMA-NEXT: vpaddq %ymm6, %ymm7, %ymm6
; AVXIFMA-NEXT: vpsllq $32, %ymm6, %ymm6
; AVXIFMA-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
; AVXIFMA-NEXT: vpsrlq $32, %ymm0, %ymm3
; AVXIFMA-NEXT: vpmuludq %ymm2, %ymm3, %ymm3
; AVXIFMA-NEXT: vpsrlq $32, %ymm2, %ymm7
; AVXIFMA-NEXT: vpmuludq %ymm7, %ymm0, %ymm7
; AVXIFMA-NEXT: vpaddq %ymm3, %ymm7, %ymm3
; AVXIFMA-NEXT: vpsllq $32, %ymm3, %ymm3
; AVXIFMA-NEXT: vpmuludq %ymm2, %ymm0, %ymm0
; AVXIFMA-NEXT: vpaddq %ymm4, %ymm0, %ymm0
; AVXIFMA-NEXT: vpaddq %ymm3, %ymm0, %ymm0
; AVXIFMA-NEXT: vpaddq %ymm5, %ymm1, %ymm1
; AVXIFMA-NEXT: vpaddq %ymm6, %ymm1, %ymm1
; AVXIFMA-NEXT: retq
;
; AVX512-LABEL: test_512_no_combine_v2:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm3
; AVX512-NEXT: vpmuludq %zmm1, %zmm3, %zmm3
; AVX512-NEXT: vpsrlq $32, %zmm1, %zmm4
; AVX512-NEXT: vpmuludq %zmm4, %zmm0, %zmm4
; AVX512-NEXT: vpaddq %zmm3, %zmm4, %zmm3
; AVX512-NEXT: vpsllq $32, %zmm3, %zmm3
; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpaddq %zmm3, %zmm0, %zmm0
; AVX512-NEXT: retq
;
; AVX512-NOIFMA-LABEL: test_512_no_combine_v2:
; AVX512-NOIFMA: # %bb.0:
; AVX512-NOIFMA-NEXT: vpsrlq $32, %zmm0, %zmm3
; AVX512-NOIFMA-NEXT: vpmuludq %zmm1, %zmm3, %zmm3
; AVX512-NOIFMA-NEXT: vpsrlq $32, %zmm1, %zmm4
; AVX512-NOIFMA-NEXT: vpmuludq %zmm4, %zmm0, %zmm4
; AVX512-NOIFMA-NEXT: vpaddq %zmm3, %zmm4, %zmm3
; AVX512-NOIFMA-NEXT: vpsllq $32, %zmm3, %zmm3
; AVX512-NOIFMA-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512-NOIFMA-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512-NOIFMA-NEXT: vpaddq %zmm3, %zmm0, %zmm0
; AVX512-NOIFMA-NEXT: retq
%mul = mul <8 x i64> %x, %y
%res = add <8 x i64> %mul, %z
ret <8 x i64> %res
}
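; 256-bit case: AVXIFMA provides the VEX-encoded ymm form and AVX512VL the
; EVEX one; plain AVX512IFMA only has the zmm form and is left as a
; multiply-add here.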
define <4 x i64> @test_256_combine(<4 x i64> %x, <4 x i64> %y, <4 x i64> %z) {
; AVX-LABEL: test_256_combine:
; AVX: # %bb.0:
; AVX-NEXT: vpbroadcastq {{.*#+}} ymm3 = [67108863,67108863,67108863,67108863]
; AVX-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX-NEXT: {vex} vpmadd52luq %ymm1, %ymm0, %ymm2
; AVX-NEXT: vmovdqa %ymm2, %ymm0
; AVX-NEXT: retq
;
; AVX512-NOVL-LABEL: test_256_combine:
; AVX512-NOVL: # %bb.0:
; AVX512-NOVL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [67108863,67108863,67108863,67108863]
; AVX512-NOVL-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512-NOVL-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512-NOVL-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
; AVX512-NOVL-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX512-NOVL-NEXT: retq
;
; AVX512VL-LABEL: test_256_combine:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [67108863,67108863,67108863,67108863]
; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT: vpmadd52luq %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VL-NEXT: retq
%x_masked = and <4 x i64> %x, splat (i64 67108863)
%y_masked = and <4 x i64> %y, splat (i64 67108863)
%mul = mul nuw nsw <4 x i64> %x_masked, %y_masked
%res = add nuw nsw <4 x i64> %z, %mul
ret <4 x i64> %res
}
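; Unmasked 256-bit multiply: no combine on any target.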
define <4 x i64> @test_256_no_combine(<4 x i64> %x, <4 x i64> %y, <4 x i64> %z) {
; X64-LABEL: test_256_no_combine:
; X64: # %bb.0:
; X64-NEXT: vpsrlq $32, %ymm0, %ymm3
; X64-NEXT: vpmuludq %ymm1, %ymm3, %ymm3
; X64-NEXT: vpsrlq $32, %ymm1, %ymm4
; X64-NEXT: vpmuludq %ymm4, %ymm0, %ymm4
; X64-NEXT: vpaddq %ymm3, %ymm4, %ymm3
; X64-NEXT: vpsllq $32, %ymm3, %ymm3
; X64-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; X64-NEXT: vpaddq %ymm3, %ymm0, %ymm0
; X64-NEXT: retq
%mul = mul <4 x i64> %x, %y
%res = add <4 x i64> %mul, %z
ret <4 x i64> %res
}
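; 128-bit case, same legality as the 256-bit one.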
define <2 x i64> @test_128_combine(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z) {
; AVX-LABEL: test_128_combine:
; AVX: # %bb.0:
; AVX-NEXT: vpbroadcastq {{.*#+}} xmm3 = [67108863,67108863]
; AVX-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX-NEXT: {vex} vpmadd52luq %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa %xmm2, %xmm0
; AVX-NEXT: retq
;
; AVX512-NOVL-LABEL: test_128_combine:
; AVX512-NOVL: # %bb.0:
; AVX512-NOVL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [67108863,67108863]
; AVX512-NOVL-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX512-NOVL-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX512-NOVL-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
; AVX512-NOVL-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; AVX512-NOVL-NEXT: retq
;
; AVX512VL-LABEL: test_128_combine:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [67108863,67108863]
; AVX512VL-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX512VL-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX512VL-NEXT: vpmadd52luq %xmm1, %xmm0, %xmm2
; AVX512VL-NEXT: vmovdqa %xmm2, %xmm0
; AVX512VL-NEXT: retq
%x_masked = and <2 x i64> %x, splat (i64 67108863)
%y_masked = and <2 x i64> %y, splat (i64 67108863)
%mul = mul <2 x i64> %x_masked, %y_masked
%res = add <2 x i64> %z, %mul
ret <2 x i64> %res
}
; Sanity check that the combine is not applied to the scalarized case.
define <1 x i64> @test_scalar_no_ifma(<1 x i64> %x, <1 x i64> %y, <1 x i64> %z) {
; X64-LABEL: test_scalar_no_ifma:
; X64: # %bb.0:
; X64-NEXT: imulq %rsi, %rdi
; X64-NEXT: leaq (%rdi,%rdx), %rax
; X64-NEXT: retq
%mul = mul <1 x i64> %x, %y
%res = add <1 x i64> %mul, %z
ret <1 x i64> %res
}
; 40-bit and 13-bit operands: the product can need 53 bits, one too many for
; the 52-bit combine.
define <8 x i64> @test_mixed_width_too_wide(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
; AVXIFMA-LABEL: test_mixed_width_too_wide:
; AVXIFMA: # %bb.0:
; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8191,8191,8191,8191]
; AVXIFMA-NEXT: vpand %ymm6, %ymm2, %ymm2
; AVXIFMA-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVXIFMA-NEXT: vpmovzxdq {{.*#+}} ymm6 = [2155905028,2155905036,2155905044,2155905052]
; AVXIFMA-NEXT: vpshufb %ymm6, %ymm1, %ymm7
; AVXIFMA-NEXT: vpmuludq %ymm3, %ymm7, %ymm7
; AVXIFMA-NEXT: vpsllq $32, %ymm7, %ymm7
; AVXIFMA-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
; AVXIFMA-NEXT: vpshufb %ymm6, %ymm0, %ymm3
; AVXIFMA-NEXT: vpmuludq %ymm2, %ymm3, %ymm3
; AVXIFMA-NEXT: vpsllq $32, %ymm3, %ymm3
; AVXIFMA-NEXT: vpmuludq %ymm2, %ymm0, %ymm0
; AVXIFMA-NEXT: vpaddq %ymm0, %ymm4, %ymm0
; AVXIFMA-NEXT: vpaddq %ymm3, %ymm0, %ymm0
; AVXIFMA-NEXT: vpaddq %ymm1, %ymm5, %ymm1
; AVXIFMA-NEXT: vpaddq %ymm7, %ymm1, %ymm1
; AVXIFMA-NEXT: retq
;
; AVX512-LABEL: test_mixed_width_too_wide:
; AVX512: # %bb.0:
; AVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm3
; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0
; AVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpsllq $32, %zmm0, %zmm0
; AVX512-NEXT: vpaddq %zmm3, %zmm2, %zmm1
; AVX512-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; AVX512-NEXT: retq
;
; AVX512-NOIFMA-LABEL: test_mixed_width_too_wide:
; AVX512-NOIFMA: # %bb.0:
; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
; AVX512-NOIFMA-NEXT: vpmuludq %zmm1, %zmm0, %zmm3
; AVX512-NOIFMA-NEXT: vpsrlq $32, %zmm0, %zmm0
; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; AVX512-NOIFMA-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512-NOIFMA-NEXT: vpsllq $32, %zmm0, %zmm0
; AVX512-NOIFMA-NEXT: vpaddq %zmm3, %zmm2, %zmm1
; AVX512-NOIFMA-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; AVX512-NOIFMA-NEXT: retq
%x40 = and <8 x i64> %x, splat (i64 1099511627775)
%y13 = and <8 x i64> %y, splat (i64 8191)
%mul = mul <8 x i64> %x40, %y13
%res = add <8 x i64> %z, %mul
ret <8 x i64> %res
}
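; zext from i32 only bounds each operand to 32 bits, so the product can need
; up to 64 bits and the combine is unsafe.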
define <8 x i64> @test_zext32_inputs_not_safe(<8 x i32> %xi32, <8 x i32> %yi32, <8 x i64> %z) {
; AVXIFMA-LABEL: test_zext32_inputs_not_safe:
; AVXIFMA: # %bb.0:
; AVXIFMA-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVXIFMA-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVXIFMA-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVXIFMA-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVXIFMA-NEXT: vpmuludq %ymm5, %ymm4, %ymm4
; AVXIFMA-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVXIFMA-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVXIFMA-NEXT: vpmuludq %ymm1, %ymm0, %ymm1
; AVXIFMA-NEXT: vpaddq %ymm4, %ymm2, %ymm0
; AVXIFMA-NEXT: vpaddq %ymm1, %ymm3, %ymm1
; AVXIFMA-NEXT: retq
;
; AVX512-LABEL: test_zext32_inputs_not_safe:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; AVX512-NEXT: retq
;
; AVX512-NOIFMA-LABEL: test_zext32_inputs_not_safe:
; AVX512-NOIFMA: # %bb.0:
; AVX512-NOIFMA-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; AVX512-NOIFMA-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; AVX512-NOIFMA-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512-NOIFMA-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; AVX512-NOIFMA-NEXT: retq
%x = zext <8 x i32> %xi32 to <8 x i64>
%y = zext <8 x i32> %yi32 to <8 x i64>
%mul = mul <8 x i64> %x, %y
%res = add <8 x i64> %z, %mul
ret <8 x i64> %res
}
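; The illegal <16 x i64> type is split into legal halves; the combine still
; applies to each half.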
define <16 x i64> @test_1024_combine_split(<16 x i64> %x, <16 x i64> %y, <16 x i64> %z) nounwind {
; AVXIFMA-LABEL: test_1024_combine_split:
; AVXIFMA: # %bb.0:
; AVXIFMA-NEXT: pushq %rbp
; AVXIFMA-NEXT: movq %rsp, %rbp
; AVXIFMA-NEXT: andq $-32, %rsp
; AVXIFMA-NEXT: subq $32, %rsp
; AVXIFMA-NEXT: vmovdqa 112(%rbp), %ymm8
; AVXIFMA-NEXT: vmovdqa 80(%rbp), %ymm9
; AVXIFMA-NEXT: vmovdqa 48(%rbp), %ymm10
; AVXIFMA-NEXT: vmovdqa 16(%rbp), %ymm11
; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm12 = [67108863,67108863,67108863,67108863]
; AVXIFMA-NEXT: vpand %ymm3, %ymm12, %ymm3
; AVXIFMA-NEXT: vpand %ymm2, %ymm12, %ymm2
; AVXIFMA-NEXT: vpand %ymm1, %ymm12, %ymm1
; AVXIFMA-NEXT: vpand %ymm0, %ymm12, %ymm0
; AVXIFMA-NEXT: vpand %ymm7, %ymm12, %ymm7
; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm7, %ymm3, %ymm8
; AVXIFMA-NEXT: vpand %ymm6, %ymm12, %ymm3
; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm3, %ymm2, %ymm9
; AVXIFMA-NEXT: vpand %ymm5, %ymm12, %ymm2
; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm2, %ymm1, %ymm10
; AVXIFMA-NEXT: vpand %ymm4, %ymm12, %ymm1
; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm1, %ymm0, %ymm11
; AVXIFMA-NEXT: vmovdqa %ymm11, %ymm0
; AVXIFMA-NEXT: vmovdqa %ymm10, %ymm1
; AVXIFMA-NEXT: vmovdqa %ymm9, %ymm2
; AVXIFMA-NEXT: vmovdqa %ymm8, %ymm3
; AVXIFMA-NEXT: movq %rbp, %rsp
; AVXIFMA-NEXT: popq %rbp
; AVXIFMA-NEXT: retq
;
; AVX512-LABEL: test_1024_combine_split:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm6 = [67108863,67108863,67108863,67108863,67108863,67108863,67108863,67108863]
; AVX512-NEXT: vpandq %zmm6, %zmm2, %zmm2
; AVX512-NEXT: vpandq %zmm6, %zmm0, %zmm0
; AVX512-NEXT: vpmadd52luq %zmm2, %zmm0, %zmm4
; AVX512-NEXT: vpandq %zmm6, %zmm3, %zmm0
; AVX512-NEXT: vpandq %zmm6, %zmm1, %zmm1
; AVX512-NEXT: vpmadd52luq %zmm0, %zmm1, %zmm5
; AVX512-NEXT: vmovdqa64 %zmm4, %zmm0
; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1
; AVX512-NEXT: retq
;
; AVX512-NOIFMA-LABEL: test_1024_combine_split:
; AVX512-NOIFMA: # %bb.0:
; AVX512-NOIFMA-NEXT: vpbroadcastq {{.*#+}} zmm6 = [67108863,67108863,67108863,67108863,67108863,67108863,67108863,67108863]
; AVX512-NOIFMA-NEXT: vpandq %zmm6, %zmm1, %zmm1
; AVX512-NOIFMA-NEXT: vpandq %zmm6, %zmm0, %zmm0
; AVX512-NOIFMA-NEXT: vpandq %zmm6, %zmm3, %zmm3
; AVX512-NOIFMA-NEXT: vpandq %zmm6, %zmm2, %zmm2
; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm2, %ymm6
; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm0, %ymm7
; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm4, %ymm8
; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm6, %ymm7, %ymm8
; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm2, %ymm0, %ymm4
; AVX512-NOIFMA-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm0
; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm3, %ymm2
; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm1, %ymm4
; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm5, %ymm6
; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm2, %ymm4, %ymm6
; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm3, %ymm1, %ymm5
; AVX512-NOIFMA-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm1
; AVX512-NOIFMA-NEXT: retq
%x_masked = and <16 x i64> %x, splat (i64 67108863)
%y_masked = and <16 x i64> %y, splat (i64 67108863)
%mul = mul <16 x i64> %x_masked, %y_masked
%res = add <16 x i64> %z, %mul
ret <16 x i64> %res
}
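; <1 x i64> is scalarized, so there is no vector IFMA to form. Note that both
; multiplicands are the masked %x, i.e. this is a squaring.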
define <1 x i64> @test_not_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %z) {
; X64-LABEL: test_not_v1i64:
; X64: # %bb.0:
; X64-NEXT: andl $67108863, %edi # imm = 0x3FFFFFF
; X64-NEXT: imulq %rdi, %rdi
; X64-NEXT: leaq (%rdi,%rdx), %rax
; X64-NEXT: retq
%x_masked = and <1 x i64> %x, splat (i64 67108863)
%y_masked = and <1 x i64> %x, splat (i64 67108863)
%mul = mul <1 x i64> %x_masked, %y_masked
%res = add <1 x i64> %mul, %z
ret <1 x i64> %res
}
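; Non-power-of-two <3 x i64> is widened to <4 x i64>; again both
; multiplicands are the masked %x.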
define <3 x i64> @test_v3i64(<3 x i64> %x, <3 x i64> %y, <3 x i64> %z) {
; AVXIFMA-LABEL: test_v3i64:
; AVXIFMA: # %bb.0:
; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm1 = [67108863,67108863,67108863,67108863]
; AVXIFMA-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVXIFMA-NEXT: vpmuludq %ymm0, %ymm0, %ymm0
; AVXIFMA-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVXIFMA-NEXT: retq
;
; AVX512-NOVL-LABEL: test_v3i64:
; AVX512-NOVL: # %bb.0:
; AVX512-NOVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [67108863,67108863,67108863,67108863]
; AVX512-NOVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NOVL-NEXT: vpmuludq %ymm0, %ymm0, %ymm0
; AVX512-NOVL-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX512-NOVL-NEXT: retq
;
; AVX512VL-LABEL: test_v3i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
; AVX512VL-NEXT: vpmuludq %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512-NOIFMA-LABEL: test_v3i64:
; AVX512-NOIFMA: # %bb.0:
; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
; AVX512-NOIFMA-NEXT: vpmuludq %ymm0, %ymm0, %ymm0
; AVX512-NOIFMA-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX512-NOIFMA-NEXT: retq
%x_masked = and <3 x i64> %x, splat (i64 67108863)
%y_masked = and <3 x i64> %x, splat (i64 67108863)
%mul = mul <3 x i64> %x_masked, %y_masked
%res = add <3 x i64> %mul, %z
ret <3 x i64> %res
}
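; <5 x i64>: AVXIFMA handles four lanes with the ymm IFMA and the fifth as a
; scalar tail; AVX512 widens to <8 x i64>.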
define <5 x i64> @test_v5i64(<5 x i64> %x, <5 x i64> %y, <5 x i64> %z) {
; AVXIFMA-LABEL: test_v5i64:
; AVXIFMA: # %bb.0:
; AVXIFMA-NEXT: movq %rdi, %rax
; AVXIFMA-NEXT: vmovq %r8, %xmm0
; AVXIFMA-NEXT: vmovq %rcx, %xmm1
; AVXIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVXIFMA-NEXT: vmovq %rdx, %xmm1
; AVXIFMA-NEXT: vmovq %rsi, %xmm2
; AVXIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVXIFMA-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVXIFMA-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVXIFMA-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm2
; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm3 = [67108863,67108863,67108863,67108863]
; AVXIFMA-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVXIFMA-NEXT: movl $67108863, %ecx # imm = 0x3FFFFFF
; AVXIFMA-NEXT: vmovq %rcx, %xmm3
; AVXIFMA-NEXT: vmovq %r9, %xmm4
; AVXIFMA-NEXT: vpand %xmm3, %xmm4, %xmm3
; AVXIFMA-NEXT: vpsrlq $32, %xmm3, %xmm4
; AVXIFMA-NEXT: vpmuludq %xmm4, %xmm3, %xmm4
; AVXIFMA-NEXT: vpsllq $33, %xmm4, %xmm4
; AVXIFMA-NEXT: vpmuludq %xmm3, %xmm3, %xmm3
; AVXIFMA-NEXT: vpaddq %xmm1, %xmm3, %xmm1
; AVXIFMA-NEXT: vpaddq %xmm4, %xmm1, %xmm1
; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm0, %ymm0, %ymm2
; AVXIFMA-NEXT: vmovdqa %ymm2, (%rdi)
; AVXIFMA-NEXT: vmovq %xmm1, 32(%rdi)
; AVXIFMA-NEXT: vzeroupper
; AVXIFMA-NEXT: retq
;
; AVX512-LABEL: test_v5i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; AVX512-NEXT: vpmuludq %zmm0, %zmm0, %zmm0
; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: retq
;
; AVX512-NOIFMA-LABEL: test_v5i64:
; AVX512-NOIFMA: # %bb.0:
; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; AVX512-NOIFMA-NEXT: vpmuludq %zmm0, %zmm0, %zmm0
; AVX512-NOIFMA-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512-NOIFMA-NEXT: retq
%x_masked = and <5 x i64> %x, splat (i64 67108863)
%y_masked = and <5 x i64> %x, splat (i64 67108863)
%mul = mul <5 x i64> %x_masked, %y_masked
%res = add <5 x i64> %mul, %z
ret <5 x i64> %res
}
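; <6 x i64>: on AVXIFMA the low four lanes use the ymm IFMA and the high two
; an xmm multiply-add; AVX512 widens to <8 x i64>.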
define <6 x i64> @test_v6i64(<6 x i64> %x, <6 x i64> %y, <6 x i64> %z) {
; AVXIFMA-LABEL: test_v6i64:
; AVXIFMA: # %bb.0:
; AVXIFMA-NEXT: movq %rdi, %rax
; AVXIFMA-NEXT: vmovq %r8, %xmm0
; AVXIFMA-NEXT: vmovq %rcx, %xmm1
; AVXIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVXIFMA-NEXT: vmovq %rdx, %xmm1
; AVXIFMA-NEXT: vmovq %rsi, %xmm2
; AVXIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVXIFMA-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVXIFMA-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm1
; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm2 = [67108863,67108863,67108863,67108863]
; AVXIFMA-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm0, %ymm0, %ymm1
; AVXIFMA-NEXT: vmovq %r9, %xmm0
; AVXIFMA-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVXIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; AVXIFMA-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVXIFMA-NEXT: vpmuldq %xmm0, %xmm0, %xmm0
; AVXIFMA-NEXT: vpaddq {{[0-9]+}}(%rsp), %xmm0, %xmm0
; AVXIFMA-NEXT: vmovdqa %xmm0, 32(%rdi)
; AVXIFMA-NEXT: vmovdqa %ymm1, (%rdi)
; AVXIFMA-NEXT: vzeroupper
; AVXIFMA-NEXT: retq
;
; AVX512-LABEL: test_v6i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; AVX512-NEXT: vpmuludq %zmm0, %zmm0, %zmm0
; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: retq
;
; AVX512-NOIFMA-LABEL: test_v6i64:
; AVX512-NOIFMA: # %bb.0:
; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; AVX512-NOIFMA-NEXT: vpmuludq %zmm0, %zmm0, %zmm0
; AVX512-NOIFMA-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512-NOIFMA-NEXT: retq
%x_masked = and <6 x i64> %x, splat (i64 67108863)
%y_masked = and <6 x i64> %x, splat (i64 67108863)
%mul = mul <6 x i64> %x_masked, %y_masked
%res = add <6 x i64> %mul, %z
ret <6 x i64> %res
}
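; <9 x i64> splits into vector parts plus a scalar tail for the ninth lane.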
define <9 x i64> @test_v9i64(<9 x i64> %x, <9 x i64> %y, <9 x i64> %z) {
; AVXIFMA-LABEL: test_v9i64:
; AVXIFMA: # %bb.0:
; AVXIFMA-NEXT: movq %rdi, %rax
; AVXIFMA-NEXT: vmovq %r8, %xmm0
; AVXIFMA-NEXT: vmovq %rcx, %xmm1
; AVXIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVXIFMA-NEXT: vmovq %rdx, %xmm1
; AVXIFMA-NEXT: vmovq %rsi, %xmm2
; AVXIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVXIFMA-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVXIFMA-NEXT: vmovq %r9, %xmm1
; AVXIFMA-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVXIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVXIFMA-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1
; AVXIFMA-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVXIFMA-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm3
; AVXIFMA-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm4
; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm5 = [67108863,67108863,67108863,67108863]
; AVXIFMA-NEXT: vpand %ymm5, %ymm0, %ymm0
; AVXIFMA-NEXT: vpand %ymm5, %ymm1, %ymm1
; AVXIFMA-NEXT: movl $67108863, %ecx # imm = 0x3FFFFFF
; AVXIFMA-NEXT: vmovq %rcx, %xmm5
; AVXIFMA-NEXT: vmovq {{.*#+}} xmm6 = mem[0],zero
; AVXIFMA-NEXT: vpand %xmm5, %xmm6, %xmm5
; AVXIFMA-NEXT: vpsrlq $32, %xmm5, %xmm6
; AVXIFMA-NEXT: vpmuludq %xmm6, %xmm5, %xmm6
; AVXIFMA-NEXT: vpsllq $33, %xmm6, %xmm6
; AVXIFMA-NEXT: vpmuludq %xmm5, %xmm5, %xmm5
; AVXIFMA-NEXT: vpaddq %xmm2, %xmm5, %xmm2
; AVXIFMA-NEXT: vpaddq %xmm6, %xmm2, %xmm2
; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm0, %ymm0, %ymm4
; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm1, %ymm1, %ymm3
; AVXIFMA-NEXT: vmovdqa %ymm3, 32(%rdi)
; AVXIFMA-NEXT: vmovdqa %ymm4, (%rdi)
; AVXIFMA-NEXT: vmovq %xmm2, 64(%rdi)
; AVXIFMA-NEXT: vzeroupper
; AVXIFMA-NEXT: retq
;
; AVX512-LABEL: test_v9i64:
; AVX512: # %bb.0:
; AVX512-NEXT: movq %rdi, %rax
; AVX512-NEXT: vmovq %r8, %xmm0
; AVX512-NEXT: vmovq %rcx, %xmm1
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512-NEXT: vmovq %rdx, %xmm1
; AVX512-NEXT: vmovq %rsi, %xmm2
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512-NEXT: vmovq %r9, %xmm1
; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512-NEXT: vmovdqu64 {{[0-9]+}}(%rsp), %zmm2
; AVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; AVX512-NEXT: movl $67108863, %ecx # imm = 0x3FFFFFF
; AVX512-NEXT: vmovq %rcx, %xmm3
; AVX512-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX512-NEXT: vpand %xmm3, %xmm4, %xmm3
; AVX512-NEXT: vpsrlq $32, %xmm3, %xmm4
; AVX512-NEXT: vpmuludq %xmm4, %xmm3, %xmm4
; AVX512-NEXT: vpsllq $33, %xmm4, %xmm4
; AVX512-NEXT: vpmuludq %xmm3, %xmm3, %xmm3
; AVX512-NEXT: vpaddq %xmm1, %xmm3, %xmm1
; AVX512-NEXT: vpaddq %xmm4, %xmm1, %xmm1
; AVX512-NEXT: vpmadd52luq %zmm0, %zmm0, %zmm2
; AVX512-NEXT: vmovq %xmm1, 64(%rdi)
; AVX512-NEXT: vmovdqa64 %zmm2, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-NOIFMA-LABEL: test_v9i64:
; AVX512-NOIFMA: # %bb.0:
; AVX512-NOIFMA-NEXT: movq %rdi, %rax
; AVX512-NOIFMA-NEXT: vmovq %r8, %xmm0
; AVX512-NOIFMA-NEXT: vmovq %rcx, %xmm1
; AVX512-NOIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512-NOIFMA-NEXT: vmovq %rdx, %xmm1
; AVX512-NOIFMA-NEXT: vmovq %rsi, %xmm2
; AVX512-NOIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512-NOIFMA-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512-NOIFMA-NEXT: vmovq %r9, %xmm1
; AVX512-NOIFMA-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512-NOIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512-NOIFMA-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1
; AVX512-NOIFMA-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NOIFMA-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; AVX512-NOIFMA-NEXT: movl $67108863, %ecx # imm = 0x3FFFFFF
; AVX512-NOIFMA-NEXT: vmovq %rcx, %xmm2
; AVX512-NOIFMA-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX512-NOIFMA-NEXT: vpand %xmm2, %xmm3, %xmm2
; AVX512-NOIFMA-NEXT: vpsrlq $32, %xmm2, %xmm3
; AVX512-NOIFMA-NEXT: vpmuludq %xmm3, %xmm2, %xmm3
; AVX512-NOIFMA-NEXT: vpsllq $33, %xmm3, %xmm3
; AVX512-NOIFMA-NEXT: vpmuludq %xmm2, %xmm2, %xmm2
; AVX512-NOIFMA-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; AVX512-NOIFMA-NEXT: vpaddq %xmm3, %xmm1, %xmm1
; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512-NOIFMA-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm3
; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm2, %ymm2, %ymm3
; AVX512-NOIFMA-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm2
; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm0, %ymm0, %ymm2
; AVX512-NOIFMA-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0
; AVX512-NOIFMA-NEXT: vmovq %xmm1, 64(%rdi)
; AVX512-NOIFMA-NEXT: vmovdqa64 %zmm0, (%rdi)
; AVX512-NOIFMA-NEXT: vzeroupper
; AVX512-NOIFMA-NEXT: retq
%x_masked = and <9 x i64> %x, splat (i64 67108863)
%y_masked = and <9 x i64> %x, splat (i64 67108863)
%mul = mul <9 x i64> %x_masked, %y_masked
%res = add <9 x i64> %mul, %z
ret <9 x i64> %res
}