test/CodeGen/X86/sse-intel-ocl.ll - llvm-project/llvm - Git at Google

 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=nehalem | FileCheck -check-prefix=WIN32 %s
 ; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=nehalem | FileCheck -check-prefix=WIN64 %s
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=nehalem | FileCheck -check-prefix=NOT_WIN %s

 ; External callees used by the tests in this file.  Both are declared with the
 ; default calling convention; @func_float16_ptr is called through
 ; intel_ocl_bicc call sites and @func_float16 through a default-convention
 ; call site.
 declare <16 x float> @func_float16_ptr(<16 x float>, ptr)
 declare <16 x float> @func_float16(<16 x float>, <16 x float>)

 ; Test calling conventions - input parameters.
 ; Adds %a and %b, passes the sum together with a pointer to a 16-byte-aligned
 ; stack slot to @func_float16_ptr through an intel_ocl_bicc call, and returns
 ; the value loaded from that slot added to the call result.  The function
 ; itself uses the default calling convention; only the call site is
 ; intel_ocl_bicc.
 define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
 ; WIN32-LABEL: testf16_inp:
 ; WIN32:       # %bb.0:
 ; WIN32-NEXT:    pushl %ebp
 ; WIN32-NEXT:    movl %esp, %ebp
 ; WIN32-NEXT:    andl $-16, %esp
 ; WIN32-NEXT:    subl $80, %esp
 ; WIN32-NEXT:    movups 72(%ebp), %xmm4
 ; WIN32-NEXT:    movups 8(%ebp), %xmm3
 ; WIN32-NEXT:    addps %xmm4, %xmm3
 ; WIN32-NEXT:    movups 56(%ebp), %xmm4
 ; WIN32-NEXT:    movups 40(%ebp), %xmm5
 ; WIN32-NEXT:    movups 24(%ebp), %xmm6
 ; WIN32-NEXT:    movl %esp, %eax
 ; WIN32-NEXT:    addps %xmm6, %xmm0
 ; WIN32-NEXT:    addps %xmm5, %xmm1
 ; WIN32-NEXT:    addps %xmm4, %xmm2
 ; WIN32-NEXT:    pushl %eax
 ; WIN32-NEXT:    calll _func_float16_ptr
 ; WIN32-NEXT:    addl $4, %esp
 ; WIN32-NEXT:    addps (%esp), %xmm0
 ; WIN32-NEXT:    addps {{[0-9]+}}(%esp), %xmm1
 ; WIN32-NEXT:    addps {{[0-9]+}}(%esp), %xmm2
 ; WIN32-NEXT:    addps {{[0-9]+}}(%esp), %xmm3
 ; WIN32-NEXT:    movl %ebp, %esp
 ; WIN32-NEXT:    popl %ebp
 ; WIN32-NEXT:    retl
 ;
 ; WIN64-LABEL: testf16_inp:
 ; WIN64:       # %bb.0:
 ; WIN64-NEXT:    subq $104, %rsp
 ; WIN64-NEXT:    movaps (%r9), %xmm3
 ; WIN64-NEXT:    movaps (%r8), %xmm2
 ; WIN64-NEXT:    movaps (%rdx), %xmm1
 ; WIN64-NEXT:    movaps (%rcx), %xmm0
 ; WIN64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; WIN64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; WIN64-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
 ; WIN64-NEXT:    movq {{[0-9]+}}(%rsp), %r8
 ; WIN64-NEXT:    addps (%r8), %xmm0
 ; WIN64-NEXT:    addps (%rdx), %xmm1
 ; WIN64-NEXT:    addps (%rcx), %xmm2
 ; WIN64-NEXT:    addps (%rax), %xmm3
 ; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
 ; WIN64-NEXT:    callq func_float16_ptr
 ; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm0
 ; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm1
 ; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm2
 ; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm3
 ; WIN64-NEXT:    addq $104, %rsp
 ; WIN64-NEXT:    retq
 ;
 ; NOT_WIN-LABEL: testf16_inp:
 ; NOT_WIN:       ## %bb.0:
 ; NOT_WIN-NEXT:    subq $72, %rsp
 ; NOT_WIN-NEXT:    addps %xmm4, %xmm0
 ; NOT_WIN-NEXT:    addps %xmm5, %xmm1
 ; NOT_WIN-NEXT:    addps %xmm6, %xmm2
 ; NOT_WIN-NEXT:    addps %xmm7, %xmm3
 ; NOT_WIN-NEXT:    movq %rsp, %rdi
 ; NOT_WIN-NEXT:    callq _func_float16_ptr
 ; NOT_WIN-NEXT:    addps (%rsp), %xmm0
 ; NOT_WIN-NEXT:    addps {{[0-9]+}}(%rsp), %xmm1
 ; NOT_WIN-NEXT:    addps {{[0-9]+}}(%rsp), %xmm2
 ; NOT_WIN-NEXT:    addps {{[0-9]+}}(%rsp), %xmm3
 ; NOT_WIN-NEXT:    addq $72, %rsp
 ; NOT_WIN-NEXT:    retq
 ; IR under test.  In every expected sequence above the four quarters of the
 ; sum %x sit in xmm0-xmm3 at the call; a stack address is handed over by a
 ; push on the 32-bit Windows triple, in rcx on the 64-bit Windows triple and
 ; in rdi on the Darwin triple.
   %y = alloca <16 x float>, align 16
   %x = fadd <16 x float> %a, %b
   %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, ptr %y)
 ; %y is read back only after the call and combined with the returned vector.
   %2 = load <16 x float>, ptr %y, align 16
   %3 = fadd <16 x float> %2, %1
   ret <16 x float> %3
 }

 ; Test calling conventions - preserved registers.

 ; Preserves xmm6-xmm15 on Windows, xmm8-xmm15 on other platforms.
 ; %b is used again after the intel_ocl_bicc call, so it has to stay available
 ; across it.  The expected output keeps it in xmm6-xmm9 on the 64-bit Windows
 ; triple (those registers are spilled in the prologue and reloaded in the
 ; epilogue), copies it into xmm8-xmm11 on the Darwin triple (no spills), and
 ; re-reads it from memory at positive %ebp offsets on the 32-bit Windows
 ; triple.
 define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
 ; WIN32-LABEL: testf16_regs:
 ; WIN32:       # %bb.0:
 ; WIN32-NEXT:    pushl %ebp
 ; WIN32-NEXT:    movl %esp, %ebp
 ; WIN32-NEXT:    andl $-16, %esp
 ; WIN32-NEXT:    subl $80, %esp
 ; WIN32-NEXT:    movups 72(%ebp), %xmm6
 ; WIN32-NEXT:    movups 8(%ebp), %xmm3
 ; WIN32-NEXT:    movups 56(%ebp), %xmm7
 ; WIN32-NEXT:    movups 40(%ebp), %xmm5
 ; WIN32-NEXT:    movups 24(%ebp), %xmm4
 ; WIN32-NEXT:    movl %esp, %eax
 ; WIN32-NEXT:    addps %xmm4, %xmm0
 ; WIN32-NEXT:    addps %xmm5, %xmm1
 ; WIN32-NEXT:    addps %xmm7, %xmm2
 ; WIN32-NEXT:    addps %xmm6, %xmm3
 ; WIN32-NEXT:    pushl %eax
 ; WIN32-NEXT:    calll _func_float16_ptr
 ; WIN32-NEXT:    addl $4, %esp
 ; WIN32-NEXT:    movups 72(%ebp), %xmm4
 ; WIN32-NEXT:    addps %xmm4, %xmm3
 ; WIN32-NEXT:    movups 56(%ebp), %xmm4
 ; WIN32-NEXT:    addps %xmm4, %xmm2
 ; WIN32-NEXT:    movups 40(%ebp), %xmm4
 ; WIN32-NEXT:    addps %xmm4, %xmm1
 ; WIN32-NEXT:    movups 24(%ebp), %xmm4
 ; WIN32-NEXT:    addps %xmm4, %xmm0
 ; WIN32-NEXT:    addps (%esp), %xmm0
 ; WIN32-NEXT:    addps {{[0-9]+}}(%esp), %xmm1
 ; WIN32-NEXT:    addps {{[0-9]+}}(%esp), %xmm2
 ; WIN32-NEXT:    addps {{[0-9]+}}(%esp), %xmm3
 ; WIN32-NEXT:    movl %ebp, %esp
 ; WIN32-NEXT:    popl %ebp
 ; WIN32-NEXT:    retl
 ;
 ; WIN64-LABEL: testf16_regs:
 ; WIN64:       # %bb.0:
 ; WIN64-NEXT:    subq $168, %rsp
 ; WIN64-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; WIN64-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; WIN64-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; WIN64-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; WIN64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; WIN64-NEXT:    movaps (%rax), %xmm6
 ; WIN64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; WIN64-NEXT:    movaps (%rax), %xmm7
 ; WIN64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; WIN64-NEXT:    movaps (%rax), %xmm8
 ; WIN64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; WIN64-NEXT:    movaps (%rax), %xmm9
 ; WIN64-NEXT:    movaps (%rcx), %xmm0
 ; WIN64-NEXT:    addps %xmm9, %xmm0
 ; WIN64-NEXT:    movaps (%rdx), %xmm1
 ; WIN64-NEXT:    addps %xmm8, %xmm1
 ; WIN64-NEXT:    movaps (%r8), %xmm2
 ; WIN64-NEXT:    addps %xmm7, %xmm2
 ; WIN64-NEXT:    movaps (%r9), %xmm3
 ; WIN64-NEXT:    addps %xmm6, %xmm3
 ; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
 ; WIN64-NEXT:    callq func_float16_ptr
 ; WIN64-NEXT:    addps %xmm6, %xmm3
 ; WIN64-NEXT:    addps %xmm7, %xmm2
 ; WIN64-NEXT:    addps %xmm8, %xmm1
 ; WIN64-NEXT:    addps %xmm9, %xmm0
 ; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm0
 ; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm1
 ; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm2
 ; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm3
 ; WIN64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; WIN64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
 ; WIN64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
 ; WIN64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
 ; WIN64-NEXT:    addq $168, %rsp
 ; WIN64-NEXT:    retq
 ;
 ; NOT_WIN-LABEL: testf16_regs:
 ; NOT_WIN:       ## %bb.0:
 ; NOT_WIN-NEXT:    subq $72, %rsp
 ; NOT_WIN-NEXT:    movaps %xmm7, %xmm9
 ; NOT_WIN-NEXT:    movaps %xmm6, %xmm10
 ; NOT_WIN-NEXT:    movaps %xmm5, %xmm11
 ; NOT_WIN-NEXT:    movaps %xmm4, %xmm8
 ; NOT_WIN-NEXT:    addps %xmm4, %xmm0
 ; NOT_WIN-NEXT:    addps %xmm5, %xmm1
 ; NOT_WIN-NEXT:    addps %xmm6, %xmm2
 ; NOT_WIN-NEXT:    addps %xmm7, %xmm3
 ; NOT_WIN-NEXT:    movq %rsp, %rdi
 ; NOT_WIN-NEXT:    callq _func_float16_ptr
 ; NOT_WIN-NEXT:    addps %xmm9, %xmm3
 ; NOT_WIN-NEXT:    addps %xmm10, %xmm2
 ; NOT_WIN-NEXT:    addps %xmm11, %xmm1
 ; NOT_WIN-NEXT:    addps %xmm8, %xmm0
 ; NOT_WIN-NEXT:    addps (%rsp), %xmm0
 ; NOT_WIN-NEXT:    addps {{[0-9]+}}(%rsp), %xmm1
 ; NOT_WIN-NEXT:    addps {{[0-9]+}}(%rsp), %xmm2
 ; NOT_WIN-NEXT:    addps {{[0-9]+}}(%rsp), %xmm3
 ; NOT_WIN-NEXT:    addq $72, %rsp
 ; NOT_WIN-NEXT:    retq
 ; IR under test.  Same shape as @testf16_inp, plus one extra use of %b after
 ; the call.
   %y = alloca <16 x float>, align 16
   %x = fadd <16 x float> %a, %b
   %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, ptr %y)
   %2 = load <16 x float>, ptr %y, align 16
 ; This second use of %b is what forces it to be kept across the call.
   %3 = fadd <16 x float> %1, %b
   %4 = fadd <16 x float> %2, %3
   ret <16 x float> %4
 }

 ; Test calling conventions - prolog and epilog.
 ; Here the function itself is intel_ocl_bicc and forwards %a and %b to
 ; @func_float16 through a default-convention call.  The expected output saves
 ; and restores xmm6-xmm7 around the body on the 64-bit Windows triple and
 ; xmm8-xmm15 on the Darwin triple; the 32-bit Windows sequence contains no
 ; xmm spills or reloads.
 define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
 ; WIN32-LABEL: test_prolog_epilog:
 ; WIN32:       # %bb.0:
 ; WIN32-NEXT:    pushl %ebp
 ; WIN32-NEXT:    movl %esp, %ebp
 ; WIN32-NEXT:    andl $-16, %esp
 ; WIN32-NEXT:    subl $96, %esp
 ; WIN32-NEXT:    movups 8(%ebp), %xmm4
 ; WIN32-NEXT:    movups 24(%ebp), %xmm5
 ; WIN32-NEXT:    movups 40(%ebp), %xmm6
 ; WIN32-NEXT:    movups 56(%ebp), %xmm7
 ; WIN32-NEXT:    movups %xmm7, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    movups %xmm6, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    movups %xmm5, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    movups %xmm4, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    movups %xmm3, (%esp)
 ; WIN32-NEXT:    calll _func_float16
 ; WIN32-NEXT:    movl %ebp, %esp
 ; WIN32-NEXT:    popl %ebp
 ; WIN32-NEXT:    retl
 ;
 ; WIN64-LABEL: test_prolog_epilog:
 ; WIN64:       # %bb.0:
 ; WIN64-NEXT:    subq $232, %rsp
 ; WIN64-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; WIN64-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; WIN64-NEXT:    movaps (%r9), %xmm4
 ; WIN64-NEXT:    movaps (%rdx), %xmm5
 ; WIN64-NEXT:    movaps (%r8), %xmm6
 ; WIN64-NEXT:    movaps (%rcx), %xmm7
 ; WIN64-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
 ; WIN64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 ; WIN64-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
 ; WIN64-NEXT:    movaps %xmm3, {{[0-9]+}}(%rsp)
 ; WIN64-NEXT:    movaps %xmm7, {{[0-9]+}}(%rsp)
 ; WIN64-NEXT:    movaps %xmm6, {{[0-9]+}}(%rsp)
 ; WIN64-NEXT:    movaps %xmm5, {{[0-9]+}}(%rsp)
 ; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
 ; WIN64-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 ; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
 ; WIN64-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 ; WIN64-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
 ; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
 ; WIN64-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 ; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
 ; WIN64-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 ; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
 ; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %r8
 ; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %r9
 ; WIN64-NEXT:    callq func_float16
 ; WIN64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; WIN64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
 ; WIN64-NEXT:    addq $232, %rsp
 ; WIN64-NEXT:    retq
 ;
 ; NOT_WIN-LABEL: test_prolog_epilog:
 ; NOT_WIN:       ## %bb.0:
 ; NOT_WIN-NEXT:    subq $136, %rsp
 ; NOT_WIN-NEXT:    movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
 ; NOT_WIN-NEXT:    movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
 ; NOT_WIN-NEXT:    movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
 ; NOT_WIN-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
 ; NOT_WIN-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
 ; NOT_WIN-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
 ; NOT_WIN-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
 ; NOT_WIN-NEXT:    movaps %xmm8, (%rsp) ## 16-byte Spill
 ; NOT_WIN-NEXT:    callq _func_float16
 ; NOT_WIN-NEXT:    movaps (%rsp), %xmm8 ## 16-byte Reload
 ; NOT_WIN-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 ## 16-byte Reload
 ; NOT_WIN-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 ## 16-byte Reload
 ; NOT_WIN-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 ## 16-byte Reload
 ; NOT_WIN-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 ## 16-byte Reload
 ; NOT_WIN-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 ## 16-byte Reload
 ; NOT_WIN-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 ## 16-byte Reload
 ; NOT_WIN-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 ## 16-byte Reload
 ; NOT_WIN-NEXT:    addq $136, %rsp
 ; NOT_WIN-NEXT:    retq
 ; IR under test: a single default-convention call whose result is returned
 ; unchanged.
    %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
    ret <16 x float> %c
 }
	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=nehalem | FileCheck -check-prefix=WIN32 %s
	; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=nehalem | FileCheck -check-prefix=WIN64 %s
	; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=nehalem | FileCheck -check-prefix=NOT_WIN %s

	; External callees used by the tests in this file.  Both are declared with the
	; default calling convention; @func_float16_ptr is called through
	; intel_ocl_bicc call sites and @func_float16 through a default-convention
	; call site.
	declare <16 x float> @func_float16_ptr(<16 x float>, ptr)
	declare <16 x float> @func_float16(<16 x float>, <16 x float>)

	; Test calling conventions - input parameters.
	; Adds %a and %b, passes the sum together with a pointer to a 16-byte-aligned
	; stack slot to @func_float16_ptr through an intel_ocl_bicc call, and returns
	; the value loaded from that slot added to the call result.  The function
	; itself uses the default calling convention; only the call site is
	; intel_ocl_bicc.
	define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
	; WIN32-LABEL: testf16_inp:
	; WIN32: # %bb.0:
	; WIN32-NEXT: pushl %ebp
	; WIN32-NEXT: movl %esp, %ebp
	; WIN32-NEXT: andl $-16, %esp
	; WIN32-NEXT: subl $80, %esp
	; WIN32-NEXT: movups 72(%ebp), %xmm4
	; WIN32-NEXT: movups 8(%ebp), %xmm3
	; WIN32-NEXT: addps %xmm4, %xmm3
	; WIN32-NEXT: movups 56(%ebp), %xmm4
	; WIN32-NEXT: movups 40(%ebp), %xmm5
	; WIN32-NEXT: movups 24(%ebp), %xmm6
	; WIN32-NEXT: movl %esp, %eax
	; WIN32-NEXT: addps %xmm6, %xmm0
	; WIN32-NEXT: addps %xmm5, %xmm1
	; WIN32-NEXT: addps %xmm4, %xmm2
	; WIN32-NEXT: pushl %eax
	; WIN32-NEXT: calll _func_float16_ptr
	; WIN32-NEXT: addl $4, %esp
	; WIN32-NEXT: addps (%esp), %xmm0
	; WIN32-NEXT: addps {{[0-9]+}}(%esp), %xmm1
	; WIN32-NEXT: addps {{[0-9]+}}(%esp), %xmm2
	; WIN32-NEXT: addps {{[0-9]+}}(%esp), %xmm3
	; WIN32-NEXT: movl %ebp, %esp
	; WIN32-NEXT: popl %ebp
	; WIN32-NEXT: retl
	;
	; WIN64-LABEL: testf16_inp:
	; WIN64: # %bb.0:
	; WIN64-NEXT: subq $104, %rsp
	; WIN64-NEXT: movaps (%r9), %xmm3
	; WIN64-NEXT: movaps (%r8), %xmm2
	; WIN64-NEXT: movaps (%rdx), %xmm1
	; WIN64-NEXT: movaps (%rcx), %xmm0
	; WIN64-NEXT: movq {{[0-9]+}}(%rsp), %rax
	; WIN64-NEXT: movq {{[0-9]+}}(%rsp), %rcx
	; WIN64-NEXT: movq {{[0-9]+}}(%rsp), %rdx
	; WIN64-NEXT: movq {{[0-9]+}}(%rsp), %r8
	; WIN64-NEXT: addps (%r8), %xmm0
	; WIN64-NEXT: addps (%rdx), %xmm1
	; WIN64-NEXT: addps (%rcx), %xmm2
	; WIN64-NEXT: addps (%rax), %xmm3
	; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
	; WIN64-NEXT: callq func_float16_ptr
	; WIN64-NEXT: addps {{[0-9]+}}(%rsp), %xmm0
	; WIN64-NEXT: addps {{[0-9]+}}(%rsp), %xmm1
	; WIN64-NEXT: addps {{[0-9]+}}(%rsp), %xmm2
	; WIN64-NEXT: addps {{[0-9]+}}(%rsp), %xmm3
	; WIN64-NEXT: addq $104, %rsp
	; WIN64-NEXT: retq
	;
	; NOT_WIN-LABEL: testf16_inp:
	; NOT_WIN: ## %bb.0:
	; NOT_WIN-NEXT: subq $72, %rsp
	; NOT_WIN-NEXT: addps %xmm4, %xmm0
	; NOT_WIN-NEXT: addps %xmm5, %xmm1
	; NOT_WIN-NEXT: addps %xmm6, %xmm2
	; NOT_WIN-NEXT: addps %xmm7, %xmm3
	; NOT_WIN-NEXT: movq %rsp, %rdi
	; NOT_WIN-NEXT: callq _func_float16_ptr
	; NOT_WIN-NEXT: addps (%rsp), %xmm0
	; NOT_WIN-NEXT: addps {{[0-9]+}}(%rsp), %xmm1
	; NOT_WIN-NEXT: addps {{[0-9]+}}(%rsp), %xmm2
	; NOT_WIN-NEXT: addps {{[0-9]+}}(%rsp), %xmm3
	; NOT_WIN-NEXT: addq $72, %rsp
	; NOT_WIN-NEXT: retq
	; IR under test.  In every expected sequence above the four quarters of the
	; sum %x sit in xmm0-xmm3 at the call; a stack address is handed over by a
	; push on the 32-bit Windows triple, in rcx on the 64-bit Windows triple and
	; in rdi on the Darwin triple.
	%y = alloca <16 x float>, align 16
	%x = fadd <16 x float> %a, %b
	%1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, ptr %y)
	; %y is read back only after the call and combined with the returned vector.
	%2 = load <16 x float>, ptr %y, align 16
	%3 = fadd <16 x float> %2, %1
	ret <16 x float> %3
	}

	; Test calling conventions - preserved registers.

	; Preserves xmm6-xmm15 on Windows, xmm8-xmm15 on other platforms.
	; %b is used again after the intel_ocl_bicc call, so it has to stay available
	; across it.  The expected output keeps it in xmm6-xmm9 on the 64-bit Windows
	; triple (those registers are spilled in the prologue and reloaded in the
	; epilogue), copies it into xmm8-xmm11 on the Darwin triple (no spills), and
	; re-reads it from memory at positive %ebp offsets on the 32-bit Windows
	; triple.
	define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
	; WIN32-LABEL: testf16_regs:
	; WIN32: # %bb.0:
	; WIN32-NEXT: pushl %ebp
	; WIN32-NEXT: movl %esp, %ebp
	; WIN32-NEXT: andl $-16, %esp
	; WIN32-NEXT: subl $80, %esp
	; WIN32-NEXT: movups 72(%ebp), %xmm6
	; WIN32-NEXT: movups 8(%ebp), %xmm3
	; WIN32-NEXT: movups 56(%ebp), %xmm7
	; WIN32-NEXT: movups 40(%ebp), %xmm5
	; WIN32-NEXT: movups 24(%ebp), %xmm4
	; WIN32-NEXT: movl %esp, %eax
	; WIN32-NEXT: addps %xmm4, %xmm0
	; WIN32-NEXT: addps %xmm5, %xmm1
	; WIN32-NEXT: addps %xmm7, %xmm2
	; WIN32-NEXT: addps %xmm6, %xmm3
	; WIN32-NEXT: pushl %eax
	; WIN32-NEXT: calll _func_float16_ptr
	; WIN32-NEXT: addl $4, %esp
	; WIN32-NEXT: movups 72(%ebp), %xmm4
	; WIN32-NEXT: addps %xmm4, %xmm3
	; WIN32-NEXT: movups 56(%ebp), %xmm4
	; WIN32-NEXT: addps %xmm4, %xmm2
	; WIN32-NEXT: movups 40(%ebp), %xmm4
	; WIN32-NEXT: addps %xmm4, %xmm1
	; WIN32-NEXT: movups 24(%ebp), %xmm4
	; WIN32-NEXT: addps %xmm4, %xmm0
	; WIN32-NEXT: addps (%esp), %xmm0
	; WIN32-NEXT: addps {{[0-9]+}}(%esp), %xmm1
	; WIN32-NEXT: addps {{[0-9]+}}(%esp), %xmm2
	; WIN32-NEXT: addps {{[0-9]+}}(%esp), %xmm3
	; WIN32-NEXT: movl %ebp, %esp
	; WIN32-NEXT: popl %ebp
	; WIN32-NEXT: retl
	;
	; WIN64-LABEL: testf16_regs:
	; WIN64: # %bb.0:
	; WIN64-NEXT: subq $168, %rsp
	; WIN64-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
	; WIN64-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
	; WIN64-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
	; WIN64-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
	; WIN64-NEXT: movq {{[0-9]+}}(%rsp), %rax
	; WIN64-NEXT: movaps (%rax), %xmm6
	; WIN64-NEXT: movq {{[0-9]+}}(%rsp), %rax
	; WIN64-NEXT: movaps (%rax), %xmm7
	; WIN64-NEXT: movq {{[0-9]+}}(%rsp), %rax
	; WIN64-NEXT: movaps (%rax), %xmm8
	; WIN64-NEXT: movq {{[0-9]+}}(%rsp), %rax
	; WIN64-NEXT: movaps (%rax), %xmm9
	; WIN64-NEXT: movaps (%rcx), %xmm0
	; WIN64-NEXT: addps %xmm9, %xmm0
	; WIN64-NEXT: movaps (%rdx), %xmm1
	; WIN64-NEXT: addps %xmm8, %xmm1
	; WIN64-NEXT: movaps (%r8), %xmm2
	; WIN64-NEXT: addps %xmm7, %xmm2
	; WIN64-NEXT: movaps (%r9), %xmm3
	; WIN64-NEXT: addps %xmm6, %xmm3
	; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
	; WIN64-NEXT: callq func_float16_ptr
	; WIN64-NEXT: addps %xmm6, %xmm3
	; WIN64-NEXT: addps %xmm7, %xmm2
	; WIN64-NEXT: addps %xmm8, %xmm1
	; WIN64-NEXT: addps %xmm9, %xmm0
	; WIN64-NEXT: addps {{[0-9]+}}(%rsp), %xmm0
	; WIN64-NEXT: addps {{[0-9]+}}(%rsp), %xmm1
	; WIN64-NEXT: addps {{[0-9]+}}(%rsp), %xmm2
	; WIN64-NEXT: addps {{[0-9]+}}(%rsp), %xmm3
	; WIN64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
	; WIN64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
	; WIN64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
	; WIN64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
	; WIN64-NEXT: addq $168, %rsp
	; WIN64-NEXT: retq
	;
	; NOT_WIN-LABEL: testf16_regs:
	; NOT_WIN: ## %bb.0:
	; NOT_WIN-NEXT: subq $72, %rsp
	; NOT_WIN-NEXT: movaps %xmm7, %xmm9
	; NOT_WIN-NEXT: movaps %xmm6, %xmm10
	; NOT_WIN-NEXT: movaps %xmm5, %xmm11
	; NOT_WIN-NEXT: movaps %xmm4, %xmm8
	; NOT_WIN-NEXT: addps %xmm4, %xmm0
	; NOT_WIN-NEXT: addps %xmm5, %xmm1
	; NOT_WIN-NEXT: addps %xmm6, %xmm2
	; NOT_WIN-NEXT: addps %xmm7, %xmm3
	; NOT_WIN-NEXT: movq %rsp, %rdi
	; NOT_WIN-NEXT: callq _func_float16_ptr
	; NOT_WIN-NEXT: addps %xmm9, %xmm3
	; NOT_WIN-NEXT: addps %xmm10, %xmm2
	; NOT_WIN-NEXT: addps %xmm11, %xmm1
	; NOT_WIN-NEXT: addps %xmm8, %xmm0
	; NOT_WIN-NEXT: addps (%rsp), %xmm0
	; NOT_WIN-NEXT: addps {{[0-9]+}}(%rsp), %xmm1
	; NOT_WIN-NEXT: addps {{[0-9]+}}(%rsp), %xmm2
	; NOT_WIN-NEXT: addps {{[0-9]+}}(%rsp), %xmm3
	; NOT_WIN-NEXT: addq $72, %rsp
	; NOT_WIN-NEXT: retq
	; IR under test.  Same shape as @testf16_inp, plus one extra use of %b after
	; the call.
	%y = alloca <16 x float>, align 16
	%x = fadd <16 x float> %a, %b
	%1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, ptr %y)
	%2 = load <16 x float>, ptr %y, align 16
	; This second use of %b is what forces it to be kept across the call.
	%3 = fadd <16 x float> %1, %b
	%4 = fadd <16 x float> %2, %3
	ret <16 x float> %4
	}

	; Test calling conventions - prolog and epilog.
	; Here the function itself is intel_ocl_bicc and forwards %a and %b to
	; @func_float16 through a default-convention call.  The expected output saves
	; and restores xmm6-xmm7 around the body on the 64-bit Windows triple and
	; xmm8-xmm15 on the Darwin triple; the 32-bit Windows sequence contains no
	; xmm spills or reloads.
	define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
	; WIN32-LABEL: test_prolog_epilog:
	; WIN32: # %bb.0:
	; WIN32-NEXT: pushl %ebp
	; WIN32-NEXT: movl %esp, %ebp
	; WIN32-NEXT: andl $-16, %esp
	; WIN32-NEXT: subl $96, %esp
	; WIN32-NEXT: movups 8(%ebp), %xmm4
	; WIN32-NEXT: movups 24(%ebp), %xmm5
	; WIN32-NEXT: movups 40(%ebp), %xmm6
	; WIN32-NEXT: movups 56(%ebp), %xmm7
	; WIN32-NEXT: movups %xmm7, {{[0-9]+}}(%esp)
	; WIN32-NEXT: movups %xmm6, {{[0-9]+}}(%esp)
	; WIN32-NEXT: movups %xmm5, {{[0-9]+}}(%esp)
	; WIN32-NEXT: movups %xmm4, {{[0-9]+}}(%esp)
	; WIN32-NEXT: movups %xmm3, (%esp)
	; WIN32-NEXT: calll _func_float16
	; WIN32-NEXT: movl %ebp, %esp
	; WIN32-NEXT: popl %ebp
	; WIN32-NEXT: retl
	;
	; WIN64-LABEL: test_prolog_epilog:
	; WIN64: # %bb.0:
	; WIN64-NEXT: subq $232, %rsp
	; WIN64-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
	; WIN64-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
	; WIN64-NEXT: movaps (%r9), %xmm4
	; WIN64-NEXT: movaps (%rdx), %xmm5
	; WIN64-NEXT: movaps (%r8), %xmm6
	; WIN64-NEXT: movaps (%rcx), %xmm7
	; WIN64-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
	; WIN64-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
	; WIN64-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
	; WIN64-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp)
	; WIN64-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp)
	; WIN64-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp)
	; WIN64-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp)
	; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rax
	; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp)
	; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rax
	; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp)
	; WIN64-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
	; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rax
	; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp)
	; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rax
	; WIN64-NEXT: movq %rax, {{[0-9]+}}(%rsp)
	; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
	; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
	; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %r8
	; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %r9
	; WIN64-NEXT: callq func_float16
	; WIN64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
	; WIN64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
	; WIN64-NEXT: addq $232, %rsp
	; WIN64-NEXT: retq
	;
	; NOT_WIN-LABEL: test_prolog_epilog:
	; NOT_WIN: ## %bb.0:
	; NOT_WIN-NEXT: subq $136, %rsp
	; NOT_WIN-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
	; NOT_WIN-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
	; NOT_WIN-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
	; NOT_WIN-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
	; NOT_WIN-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
	; NOT_WIN-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
	; NOT_WIN-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
	; NOT_WIN-NEXT: movaps %xmm8, (%rsp) ## 16-byte Spill
	; NOT_WIN-NEXT: callq _func_float16
	; NOT_WIN-NEXT: movaps (%rsp), %xmm8 ## 16-byte Reload
	; NOT_WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 ## 16-byte Reload
	; NOT_WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 ## 16-byte Reload
	; NOT_WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 ## 16-byte Reload
	; NOT_WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 ## 16-byte Reload
	; NOT_WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 ## 16-byte Reload
	; NOT_WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 ## 16-byte Reload
	; NOT_WIN-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 ## 16-byte Reload
	; NOT_WIN-NEXT: addq $136, %rsp
	; NOT_WIN-NEXT: retq
	; IR under test: a single default-convention call whose result is returned
	; unchanged.
	%c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
	ret <16 x float> %c
	}