; llvm/test/CodeGen/X86/avx512fp16-combine-vfmac-fadd.ll - llvm-project - Git at Google

 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown --enable-no-signed-zeros-fp-math -mattr=avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,NO-SZ
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,HAS-SZ

 ; FADD(acc, FMA(a, b, +0.0)) can be combined to FMA(a, b, acc) if the nsz flag is set.
 ; 512-bit vfcmadd.cph variant with a +0.0 (zeroinitializer) addend and an
 ; all-ones mask, followed by an fadd with %acc. The fold into a single
 ; vfcmaddcph is only expected in the run that enables no-signed-zeros FP math;
 ; the other run expects a zeroed temporary, the vfcmaddcph and a separate vaddph.
 define dso_local <32 x half> @test1(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {
 ; NO-SZ-LABEL: test1:
 ; NO-SZ:       # %bb.0: # %entry
 ; NO-SZ-NEXT:    vfcmaddcph %zmm2, %zmm1, %zmm0
 ; NO-SZ-NEXT:    retq
 ;
 ; HAS-SZ-LABEL: test1:
 ; HAS-SZ:       # %bb.0: # %entry
 ; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; HAS-SZ-NEXT:    vfcmaddcph %zmm2, %zmm1, %zmm3
 ; HAS-SZ-NEXT:    vaddph %zmm0, %zmm3, %zmm0
 ; HAS-SZ-NEXT:    retq
 entry:
   %a.f32 = bitcast <32 x half> %a to <16 x float>
   %b.f32 = bitcast <32 x half> %b to <16 x float>
   %mac = tail call contract <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %a.f32, <16 x float> %b.f32, <16 x float> zeroinitializer, i16 -1, i32 4)
   %mac.f16 = bitcast <16 x float> %mac to <32 x half>
   %sum = fadd contract <32 x half> %mac.f16, %acc
   ret <32 x half> %sum
 }

 ; 512-bit vfmadd.cph variant with a +0.0 (zeroinitializer) addend and an
 ; all-ones mask, followed by an fadd with %acc. The fold into a single
 ; vfmaddcph is only expected in the run that enables no-signed-zeros FP math;
 ; the other run expects a zeroed temporary, the vfmaddcph and a separate vaddph.
 define dso_local <32 x half> @test2(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {
 ; NO-SZ-LABEL: test2:
 ; NO-SZ:       # %bb.0: # %entry
 ; NO-SZ-NEXT:    vfmaddcph %zmm2, %zmm1, %zmm0
 ; NO-SZ-NEXT:    retq
 ;
 ; HAS-SZ-LABEL: test2:
 ; HAS-SZ:       # %bb.0: # %entry
 ; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; HAS-SZ-NEXT:    vfmaddcph %zmm2, %zmm1, %zmm3
 ; HAS-SZ-NEXT:    vaddph %zmm0, %zmm3, %zmm0
 ; HAS-SZ-NEXT:    retq
 entry:
   %a.f32 = bitcast <32 x half> %a to <16 x float>
   %b.f32 = bitcast <32 x half> %b to <16 x float>
   %mac = tail call contract <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %a.f32, <16 x float> %b.f32, <16 x float> zeroinitializer, i16 -1, i32 4)
   %mac.f16 = bitcast <16 x float> %mac to <32 x half>
   %sum = fadd contract <32 x half> %mac.f16, %acc
   ret <32 x half> %sum
 }

 ; 256-bit vfcmadd.cph variant with a +0.0 (zeroinitializer) addend and an
 ; all-ones mask, followed by an fadd with %acc. The fold into a single
 ; vfcmaddcph is only expected in the run that enables no-signed-zeros FP math;
 ; the other run expects a zeroed temporary, the vfcmaddcph and a separate vaddph.
 define dso_local <16 x half> @test3(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {
 ; NO-SZ-LABEL: test3:
 ; NO-SZ:       # %bb.0: # %entry
 ; NO-SZ-NEXT:    vfcmaddcph %ymm2, %ymm1, %ymm0
 ; NO-SZ-NEXT:    retq
 ;
 ; HAS-SZ-LABEL: test3:
 ; HAS-SZ:       # %bb.0: # %entry
 ; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; HAS-SZ-NEXT:    vfcmaddcph %ymm2, %ymm1, %ymm3
 ; HAS-SZ-NEXT:    vaddph %ymm0, %ymm3, %ymm0
 ; HAS-SZ-NEXT:    retq
 entry:
   %a.f32 = bitcast <16 x half> %a to <8 x float>
   %b.f32 = bitcast <16 x half> %b to <8 x float>
   %mac = tail call contract <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %a.f32, <8 x float> %b.f32, <8 x float> zeroinitializer, i8 -1)
   %mac.f16 = bitcast <8 x float> %mac to <16 x half>
   %sum = fadd contract <16 x half> %mac.f16, %acc
   ret <16 x half> %sum
 }

 ; 256-bit vfmadd.cph variant with a +0.0 (zeroinitializer) addend and an
 ; all-ones mask, followed by an fadd with %acc. The fold into a single
 ; vfmaddcph is only expected in the run that enables no-signed-zeros FP math;
 ; the other run expects a zeroed temporary, the vfmaddcph and a separate vaddph.
 define dso_local <16 x half> @test4(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {
 ; NO-SZ-LABEL: test4:
 ; NO-SZ:       # %bb.0: # %entry
 ; NO-SZ-NEXT:    vfmaddcph %ymm2, %ymm1, %ymm0
 ; NO-SZ-NEXT:    retq
 ;
 ; HAS-SZ-LABEL: test4:
 ; HAS-SZ:       # %bb.0: # %entry
 ; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; HAS-SZ-NEXT:    vfmaddcph %ymm2, %ymm1, %ymm3
 ; HAS-SZ-NEXT:    vaddph %ymm0, %ymm3, %ymm0
 ; HAS-SZ-NEXT:    retq
 entry:
   %a.f32 = bitcast <16 x half> %a to <8 x float>
   %b.f32 = bitcast <16 x half> %b to <8 x float>
   %mac = tail call contract <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %a.f32, <8 x float> %b.f32, <8 x float> zeroinitializer, i8 -1)
   %mac.f16 = bitcast <8 x float> %mac to <16 x half>
   %sum = fadd contract <16 x half> %mac.f16, %acc
   ret <16 x half> %sum
 }

 ; 128-bit vfcmadd.cph variant with a +0.0 (zeroinitializer) addend and an
 ; all-ones mask, followed by an fadd with %acc. The fold into a single
 ; vfcmaddcph is only expected in the run that enables no-signed-zeros FP math;
 ; the other run expects a zeroed temporary, the vfcmaddcph and a separate vaddph.
 define dso_local <8 x half> @test5(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {
 ; NO-SZ-LABEL: test5:
 ; NO-SZ:       # %bb.0: # %entry
 ; NO-SZ-NEXT:    vfcmaddcph %xmm2, %xmm1, %xmm0
 ; NO-SZ-NEXT:    retq
 ;
 ; HAS-SZ-LABEL: test5:
 ; HAS-SZ:       # %bb.0: # %entry
 ; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; HAS-SZ-NEXT:    vfcmaddcph %xmm2, %xmm1, %xmm3
 ; HAS-SZ-NEXT:    vaddph %xmm0, %xmm3, %xmm0
 ; HAS-SZ-NEXT:    retq
 entry:
   %a.f32 = bitcast <8 x half> %a to <4 x float>
   %b.f32 = bitcast <8 x half> %b to <4 x float>
   %mac = tail call contract <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %a.f32, <4 x float> %b.f32, <4 x float> zeroinitializer, i8 -1)
   %mac.f16 = bitcast <4 x float> %mac to <8 x half>
   %sum = fadd contract <8 x half> %mac.f16, %acc
   ret <8 x half> %sum
 }

 ; 128-bit vfmadd.cph variant with a +0.0 (zeroinitializer) addend and an
 ; all-ones mask, followed by an fadd with %acc. The fold into a single
 ; vfmaddcph is only expected in the run that enables no-signed-zeros FP math;
 ; the other run expects a zeroed temporary, the vfmaddcph and a separate vaddph.
 define dso_local <8 x half> @test6(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {
 ; NO-SZ-LABEL: test6:
 ; NO-SZ:       # %bb.0: # %entry
 ; NO-SZ-NEXT:    vfmaddcph %xmm2, %xmm1, %xmm0
 ; NO-SZ-NEXT:    retq
 ;
 ; HAS-SZ-LABEL: test6:
 ; HAS-SZ:       # %bb.0: # %entry
 ; HAS-SZ-NEXT:    vxorps %xmm3, %xmm3, %xmm3
 ; HAS-SZ-NEXT:    vfmaddcph %xmm2, %xmm1, %xmm3
 ; HAS-SZ-NEXT:    vaddph %xmm0, %xmm3, %xmm0
 ; HAS-SZ-NEXT:    retq
 entry:
   %a.f32 = bitcast <8 x half> %a to <4 x float>
   %b.f32 = bitcast <8 x half> %b to <4 x float>
   %mac = tail call contract <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %a.f32, <4 x float> %b.f32, <4 x float> zeroinitializer, i8 -1)
   %mac.f16 = bitcast <4 x float> %mac to <8 x half>
   %sum = fadd contract <8 x half> %mac.f16, %acc
   ret <8 x half> %sum
 }

 ; FADD(acc, FMA(a, b, -0.0)) can be combined to FMA(a, b, acc) regardless of whether the nsz flag is set.
 ; 512-bit vfcmadd.cph variant with a -0.0 addend, followed by an fadd with %acc.
 ; Each float 0xB790000000000000 element has the bit pattern 0x80008000, i.e.
 ; two half lanes holding -0.0. Both runs expect a single vfcmaddcph that
 ; accumulates into zmm0.
 define dso_local <32 x half> @test13(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {
 ; CHECK-LABEL: test13:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vfcmaddcph %zmm2, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
 entry:
   %a.f32 = bitcast <32 x half> %a to <16 x float>
   %b.f32 = bitcast <32 x half> %b to <16 x float>
   %mac = tail call contract <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %a.f32, <16 x float> %b.f32, <16 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i16 -1, i32 4)
   %mac.f16 = bitcast <16 x float> %mac to <32 x half>
   %sum = fadd contract <32 x half> %mac.f16, %acc
   ret <32 x half> %sum
 }

 ; 512-bit vfmadd.cph variant with a -0.0 addend, followed by an fadd with %acc.
 ; Each float 0xB790000000000000 element has the bit pattern 0x80008000, i.e.
 ; two half lanes holding -0.0. Both runs expect a single vfmaddcph that
 ; accumulates into zmm0.
 define dso_local <32 x half> @test14(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {
 ; CHECK-LABEL: test14:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vfmaddcph %zmm2, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
 entry:
   %a.f32 = bitcast <32 x half> %a to <16 x float>
   %b.f32 = bitcast <32 x half> %b to <16 x float>
   %mac = tail call contract <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %a.f32, <16 x float> %b.f32, <16 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i16 -1, i32 4)
   %mac.f16 = bitcast <16 x float> %mac to <32 x half>
   %sum = fadd contract <32 x half> %mac.f16, %acc
   ret <32 x half> %sum
 }

 ; 256-bit vfcmadd.cph variant with a -0.0 addend, followed by an fadd with %acc.
 ; Each float 0xB790000000000000 element has the bit pattern 0x80008000, i.e.
 ; two half lanes holding -0.0. Both runs expect a single vfcmaddcph that
 ; accumulates into ymm0.
 define dso_local <16 x half> @test15(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {
 ; CHECK-LABEL: test15:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vfcmaddcph %ymm2, %ymm1, %ymm0
 ; CHECK-NEXT:    retq
 entry:
   %a.f32 = bitcast <16 x half> %a to <8 x float>
   %b.f32 = bitcast <16 x half> %b to <8 x float>
   %mac = tail call contract <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %a.f32, <8 x float> %b.f32, <8 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)
   %mac.f16 = bitcast <8 x float> %mac to <16 x half>
   %sum = fadd contract <16 x half> %mac.f16, %acc
   ret <16 x half> %sum
 }

 ; 256-bit vfmadd.cph variant with a -0.0 addend, followed by an fadd with %acc.
 ; Each float 0xB790000000000000 element has the bit pattern 0x80008000, i.e.
 ; two half lanes holding -0.0. Both runs expect a single vfmaddcph that
 ; accumulates into ymm0.
 define dso_local <16 x half> @test16(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {
 ; CHECK-LABEL: test16:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vfmaddcph %ymm2, %ymm1, %ymm0
 ; CHECK-NEXT:    retq
 entry:
   %a.f32 = bitcast <16 x half> %a to <8 x float>
   %b.f32 = bitcast <16 x half> %b to <8 x float>
   %mac = tail call contract <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %a.f32, <8 x float> %b.f32, <8 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)
   %mac.f16 = bitcast <8 x float> %mac to <16 x half>
   %sum = fadd contract <16 x half> %mac.f16, %acc
   ret <16 x half> %sum
 }

 ; 128-bit vfcmadd.cph variant with a -0.0 addend, followed by an fadd with %acc.
 ; Each float 0xB790000000000000 element has the bit pattern 0x80008000, i.e.
 ; two half lanes holding -0.0. Both runs expect a single vfcmaddcph that
 ; accumulates into xmm0.
 define dso_local <8 x half> @test17(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {
 ; CHECK-LABEL: test17:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vfcmaddcph %xmm2, %xmm1, %xmm0
 ; CHECK-NEXT:    retq
 entry:
   %a.f32 = bitcast <8 x half> %a to <4 x float>
   %b.f32 = bitcast <8 x half> %b to <4 x float>
   %mac = tail call contract <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %a.f32, <4 x float> %b.f32, <4 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)
   %mac.f16 = bitcast <4 x float> %mac to <8 x half>
   %sum = fadd contract <8 x half> %mac.f16, %acc
   ret <8 x half> %sum
 }

 ; 128-bit vfmadd.cph variant with a -0.0 addend, followed by an fadd with %acc.
 ; Each float 0xB790000000000000 element has the bit pattern 0x80008000, i.e.
 ; two half lanes holding -0.0. Both runs expect a single vfmaddcph that
 ; accumulates into xmm0.
 define dso_local <8 x half> @test18(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {
 ; CHECK-LABEL: test18:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vfmaddcph %xmm2, %xmm1, %xmm0
 ; CHECK-NEXT:    retq
 entry:
   %a.f32 = bitcast <8 x half> %a to <4 x float>
   %b.f32 = bitcast <8 x half> %b to <4 x float>
   %mac = tail call contract <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %a.f32, <4 x float> %b.f32, <4 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)
   %mac.f16 = bitcast <4 x float> %mac to <8 x half>
   %sum = fadd contract <8 x half> %mac.f16, %acc
   ret <8 x half> %sum
 }

 ; Declarations of the masked vfcmadd.cph / vfmadd.cph intrinsics called by the
 ; test functions in this file. The 512-bit forms take an i16 operand plus an
 ; extra i32 immarg operand; the 256-bit and 128-bit forms take an i8 operand
 ; and no immarg.
 declare <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg)
 declare <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg)
 declare <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
 declare <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
 declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
 declare <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -mtriple=x86_64-unknown-unknown --enable-no-signed-zeros-fp-math -mattr=avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,NO-SZ
	; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,HAS-SZ

	; FADD(acc, FMA(a, b, +0.0)) can be combined to FMA(a, b, acc) if the nsz flag is set.
	; 512-bit vfcmadd.cph variant with a +0.0 (zeroinitializer) addend and an
	; all-ones mask, followed by an fadd with %acc. The fold into a single
	; vfcmaddcph is only expected in the run that enables no-signed-zeros FP math;
	; the other run expects a zeroed temporary, the vfcmaddcph and a separate vaddph.
	define dso_local <32 x half> @test1(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {
	; NO-SZ-LABEL: test1:
	; NO-SZ: # %bb.0: # %entry
	; NO-SZ-NEXT: vfcmaddcph %zmm2, %zmm1, %zmm0
	; NO-SZ-NEXT: retq
	;
	; HAS-SZ-LABEL: test1:
	; HAS-SZ: # %bb.0: # %entry
	; HAS-SZ-NEXT: vxorps %xmm3, %xmm3, %xmm3
	; HAS-SZ-NEXT: vfcmaddcph %zmm2, %zmm1, %zmm3
	; HAS-SZ-NEXT: vaddph %zmm0, %zmm3, %zmm0
	; HAS-SZ-NEXT: retq
	entry:
	  %a.f32 = bitcast <32 x half> %a to <16 x float>
	  %b.f32 = bitcast <32 x half> %b to <16 x float>
	  %mac = tail call contract <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %a.f32, <16 x float> %b.f32, <16 x float> zeroinitializer, i16 -1, i32 4)
	  %mac.f16 = bitcast <16 x float> %mac to <32 x half>
	  %sum = fadd contract <32 x half> %mac.f16, %acc
	  ret <32 x half> %sum
	}

	; 512-bit vfmadd.cph variant with a +0.0 (zeroinitializer) addend and an
	; all-ones mask, followed by an fadd with %acc. The fold into a single
	; vfmaddcph is only expected in the run that enables no-signed-zeros FP math;
	; the other run expects a zeroed temporary, the vfmaddcph and a separate vaddph.
	define dso_local <32 x half> @test2(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {
	; NO-SZ-LABEL: test2:
	; NO-SZ: # %bb.0: # %entry
	; NO-SZ-NEXT: vfmaddcph %zmm2, %zmm1, %zmm0
	; NO-SZ-NEXT: retq
	;
	; HAS-SZ-LABEL: test2:
	; HAS-SZ: # %bb.0: # %entry
	; HAS-SZ-NEXT: vxorps %xmm3, %xmm3, %xmm3
	; HAS-SZ-NEXT: vfmaddcph %zmm2, %zmm1, %zmm3
	; HAS-SZ-NEXT: vaddph %zmm0, %zmm3, %zmm0
	; HAS-SZ-NEXT: retq
	entry:
	  %a.f32 = bitcast <32 x half> %a to <16 x float>
	  %b.f32 = bitcast <32 x half> %b to <16 x float>
	  %mac = tail call contract <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %a.f32, <16 x float> %b.f32, <16 x float> zeroinitializer, i16 -1, i32 4)
	  %mac.f16 = bitcast <16 x float> %mac to <32 x half>
	  %sum = fadd contract <32 x half> %mac.f16, %acc
	  ret <32 x half> %sum
	}

	; 256-bit vfcmadd.cph variant with a +0.0 (zeroinitializer) addend and an
	; all-ones mask, followed by an fadd with %acc. The fold into a single
	; vfcmaddcph is only expected in the run that enables no-signed-zeros FP math;
	; the other run expects a zeroed temporary, the vfcmaddcph and a separate vaddph.
	define dso_local <16 x half> @test3(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {
	; NO-SZ-LABEL: test3:
	; NO-SZ: # %bb.0: # %entry
	; NO-SZ-NEXT: vfcmaddcph %ymm2, %ymm1, %ymm0
	; NO-SZ-NEXT: retq
	;
	; HAS-SZ-LABEL: test3:
	; HAS-SZ: # %bb.0: # %entry
	; HAS-SZ-NEXT: vxorps %xmm3, %xmm3, %xmm3
	; HAS-SZ-NEXT: vfcmaddcph %ymm2, %ymm1, %ymm3
	; HAS-SZ-NEXT: vaddph %ymm0, %ymm3, %ymm0
	; HAS-SZ-NEXT: retq
	entry:
	  %a.f32 = bitcast <16 x half> %a to <8 x float>
	  %b.f32 = bitcast <16 x half> %b to <8 x float>
	  %mac = tail call contract <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %a.f32, <8 x float> %b.f32, <8 x float> zeroinitializer, i8 -1)
	  %mac.f16 = bitcast <8 x float> %mac to <16 x half>
	  %sum = fadd contract <16 x half> %mac.f16, %acc
	  ret <16 x half> %sum
	}

	; 256-bit vfmadd.cph variant with a +0.0 (zeroinitializer) addend and an
	; all-ones mask, followed by an fadd with %acc. The fold into a single
	; vfmaddcph is only expected in the run that enables no-signed-zeros FP math;
	; the other run expects a zeroed temporary, the vfmaddcph and a separate vaddph.
	define dso_local <16 x half> @test4(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {
	; NO-SZ-LABEL: test4:
	; NO-SZ: # %bb.0: # %entry
	; NO-SZ-NEXT: vfmaddcph %ymm2, %ymm1, %ymm0
	; NO-SZ-NEXT: retq
	;
	; HAS-SZ-LABEL: test4:
	; HAS-SZ: # %bb.0: # %entry
	; HAS-SZ-NEXT: vxorps %xmm3, %xmm3, %xmm3
	; HAS-SZ-NEXT: vfmaddcph %ymm2, %ymm1, %ymm3
	; HAS-SZ-NEXT: vaddph %ymm0, %ymm3, %ymm0
	; HAS-SZ-NEXT: retq
	entry:
	  %a.f32 = bitcast <16 x half> %a to <8 x float>
	  %b.f32 = bitcast <16 x half> %b to <8 x float>
	  %mac = tail call contract <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %a.f32, <8 x float> %b.f32, <8 x float> zeroinitializer, i8 -1)
	  %mac.f16 = bitcast <8 x float> %mac to <16 x half>
	  %sum = fadd contract <16 x half> %mac.f16, %acc
	  ret <16 x half> %sum
	}

	; 128-bit vfcmadd.cph variant with a +0.0 (zeroinitializer) addend and an
	; all-ones mask, followed by an fadd with %acc. The fold into a single
	; vfcmaddcph is only expected in the run that enables no-signed-zeros FP math;
	; the other run expects a zeroed temporary, the vfcmaddcph and a separate vaddph.
	define dso_local <8 x half> @test5(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {
	; NO-SZ-LABEL: test5:
	; NO-SZ: # %bb.0: # %entry
	; NO-SZ-NEXT: vfcmaddcph %xmm2, %xmm1, %xmm0
	; NO-SZ-NEXT: retq
	;
	; HAS-SZ-LABEL: test5:
	; HAS-SZ: # %bb.0: # %entry
	; HAS-SZ-NEXT: vxorps %xmm3, %xmm3, %xmm3
	; HAS-SZ-NEXT: vfcmaddcph %xmm2, %xmm1, %xmm3
	; HAS-SZ-NEXT: vaddph %xmm0, %xmm3, %xmm0
	; HAS-SZ-NEXT: retq
	entry:
	  %a.f32 = bitcast <8 x half> %a to <4 x float>
	  %b.f32 = bitcast <8 x half> %b to <4 x float>
	  %mac = tail call contract <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %a.f32, <4 x float> %b.f32, <4 x float> zeroinitializer, i8 -1)
	  %mac.f16 = bitcast <4 x float> %mac to <8 x half>
	  %sum = fadd contract <8 x half> %mac.f16, %acc
	  ret <8 x half> %sum
	}

	; 128-bit vfmadd.cph variant with a +0.0 (zeroinitializer) addend and an
	; all-ones mask, followed by an fadd with %acc. The fold into a single
	; vfmaddcph is only expected in the run that enables no-signed-zeros FP math;
	; the other run expects a zeroed temporary, the vfmaddcph and a separate vaddph.
	define dso_local <8 x half> @test6(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {
	; NO-SZ-LABEL: test6:
	; NO-SZ: # %bb.0: # %entry
	; NO-SZ-NEXT: vfmaddcph %xmm2, %xmm1, %xmm0
	; NO-SZ-NEXT: retq
	;
	; HAS-SZ-LABEL: test6:
	; HAS-SZ: # %bb.0: # %entry
	; HAS-SZ-NEXT: vxorps %xmm3, %xmm3, %xmm3
	; HAS-SZ-NEXT: vfmaddcph %xmm2, %xmm1, %xmm3
	; HAS-SZ-NEXT: vaddph %xmm0, %xmm3, %xmm0
	; HAS-SZ-NEXT: retq
	entry:
	  %a.f32 = bitcast <8 x half> %a to <4 x float>
	  %b.f32 = bitcast <8 x half> %b to <4 x float>
	  %mac = tail call contract <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %a.f32, <4 x float> %b.f32, <4 x float> zeroinitializer, i8 -1)
	  %mac.f16 = bitcast <4 x float> %mac to <8 x half>
	  %sum = fadd contract <8 x half> %mac.f16, %acc
	  ret <8 x half> %sum
	}

	; FADD(acc, FMA(a, b, -0.0)) can be combined to FMA(a, b, acc) regardless of whether the nsz flag is set.
	; 512-bit vfcmadd.cph variant with a -0.0 addend, followed by an fadd with %acc.
	; Each float 0xB790000000000000 element has the bit pattern 0x80008000, i.e.
	; two half lanes holding -0.0. Both runs expect a single vfcmaddcph that
	; accumulates into zmm0.
	define dso_local <32 x half> @test13(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {
	; CHECK-LABEL: test13:
	; CHECK: # %bb.0: # %entry
	; CHECK-NEXT: vfcmaddcph %zmm2, %zmm1, %zmm0
	; CHECK-NEXT: retq
	entry:
	  %a.f32 = bitcast <32 x half> %a to <16 x float>
	  %b.f32 = bitcast <32 x half> %b to <16 x float>
	  %mac = tail call contract <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %a.f32, <16 x float> %b.f32, <16 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i16 -1, i32 4)
	  %mac.f16 = bitcast <16 x float> %mac to <32 x half>
	  %sum = fadd contract <32 x half> %mac.f16, %acc
	  ret <32 x half> %sum
	}

	; 512-bit vfmadd.cph variant with a -0.0 addend, followed by an fadd with %acc.
	; Each float 0xB790000000000000 element has the bit pattern 0x80008000, i.e.
	; two half lanes holding -0.0. Both runs expect a single vfmaddcph that
	; accumulates into zmm0.
	define dso_local <32 x half> @test14(<32 x half> %acc, <32 x half> %a, <32 x half> %b) {
	; CHECK-LABEL: test14:
	; CHECK: # %bb.0: # %entry
	; CHECK-NEXT: vfmaddcph %zmm2, %zmm1, %zmm0
	; CHECK-NEXT: retq
	entry:
	  %a.f32 = bitcast <32 x half> %a to <16 x float>
	  %b.f32 = bitcast <32 x half> %b to <16 x float>
	  %mac = tail call contract <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %a.f32, <16 x float> %b.f32, <16 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i16 -1, i32 4)
	  %mac.f16 = bitcast <16 x float> %mac to <32 x half>
	  %sum = fadd contract <32 x half> %mac.f16, %acc
	  ret <32 x half> %sum
	}

	; 256-bit vfcmadd.cph variant with a -0.0 addend, followed by an fadd with %acc.
	; Each float 0xB790000000000000 element has the bit pattern 0x80008000, i.e.
	; two half lanes holding -0.0. Both runs expect a single vfcmaddcph that
	; accumulates into ymm0.
	define dso_local <16 x half> @test15(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {
	; CHECK-LABEL: test15:
	; CHECK: # %bb.0: # %entry
	; CHECK-NEXT: vfcmaddcph %ymm2, %ymm1, %ymm0
	; CHECK-NEXT: retq
	entry:
	  %a.f32 = bitcast <16 x half> %a to <8 x float>
	  %b.f32 = bitcast <16 x half> %b to <8 x float>
	  %mac = tail call contract <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %a.f32, <8 x float> %b.f32, <8 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)
	  %mac.f16 = bitcast <8 x float> %mac to <16 x half>
	  %sum = fadd contract <16 x half> %mac.f16, %acc
	  ret <16 x half> %sum
	}

	; 256-bit vfmadd.cph variant with a -0.0 addend, followed by an fadd with %acc.
	; Each float 0xB790000000000000 element has the bit pattern 0x80008000, i.e.
	; two half lanes holding -0.0. Both runs expect a single vfmaddcph that
	; accumulates into ymm0.
	define dso_local <16 x half> @test16(<16 x half> %acc, <16 x half> %a, <16 x half> %b) {
	; CHECK-LABEL: test16:
	; CHECK: # %bb.0: # %entry
	; CHECK-NEXT: vfmaddcph %ymm2, %ymm1, %ymm0
	; CHECK-NEXT: retq
	entry:
	  %a.f32 = bitcast <16 x half> %a to <8 x float>
	  %b.f32 = bitcast <16 x half> %b to <8 x float>
	  %mac = tail call contract <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %a.f32, <8 x float> %b.f32, <8 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)
	  %mac.f16 = bitcast <8 x float> %mac to <16 x half>
	  %sum = fadd contract <16 x half> %mac.f16, %acc
	  ret <16 x half> %sum
	}

	; 128-bit vfcmadd.cph variant with a -0.0 addend, followed by an fadd with %acc.
	; Each float 0xB790000000000000 element has the bit pattern 0x80008000, i.e.
	; two half lanes holding -0.0. Both runs expect a single vfcmaddcph that
	; accumulates into xmm0.
	define dso_local <8 x half> @test17(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {
	; CHECK-LABEL: test17:
	; CHECK: # %bb.0: # %entry
	; CHECK-NEXT: vfcmaddcph %xmm2, %xmm1, %xmm0
	; CHECK-NEXT: retq
	entry:
	  %a.f32 = bitcast <8 x half> %a to <4 x float>
	  %b.f32 = bitcast <8 x half> %b to <4 x float>
	  %mac = tail call contract <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %a.f32, <4 x float> %b.f32, <4 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)
	  %mac.f16 = bitcast <4 x float> %mac to <8 x half>
	  %sum = fadd contract <8 x half> %mac.f16, %acc
	  ret <8 x half> %sum
	}

	; 128-bit vfmadd.cph variant with a -0.0 addend, followed by an fadd with %acc.
	; Each float 0xB790000000000000 element has the bit pattern 0x80008000, i.e.
	; two half lanes holding -0.0. Both runs expect a single vfmaddcph that
	; accumulates into xmm0.
	define dso_local <8 x half> @test18(<8 x half> %acc, <8 x half> %a, <8 x half> %b) {
	; CHECK-LABEL: test18:
	; CHECK: # %bb.0: # %entry
	; CHECK-NEXT: vfmaddcph %xmm2, %xmm1, %xmm0
	; CHECK-NEXT: retq
	entry:
	  %a.f32 = bitcast <8 x half> %a to <4 x float>
	  %b.f32 = bitcast <8 x half> %b to <4 x float>
	  %mac = tail call contract <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %a.f32, <4 x float> %b.f32, <4 x float> <float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000, float 0xB790000000000000>, i8 -1)
	  %mac.f16 = bitcast <4 x float> %mac to <8 x half>
	  %sum = fadd contract <8 x half> %mac.f16, %acc
	  ret <8 x half> %sum
	}

	; Declarations of the masked vfcmadd.cph / vfmadd.cph intrinsics called by the
	; test functions in this file. The 512-bit forms take an i16 operand plus an
	; extra i32 immarg operand; the 256-bit and 128-bit forms take an i8 operand
	; and no immarg.
	declare <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg)
	declare <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32 immarg)
	declare <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
	declare <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
	declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
	declare <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)