llvm-gcc-4.2/gcc/config/ia64/lib1funcs.asm - llvm-archive - Git at Google

 /* Copyright (C) 2000, 2001, 2003, 2005 Free Software Foundation, Inc.
    Contributed by James E. Wilson <wilson@cygnus.com>.

    This file is part of GCC.

    GCC is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2, or (at your option)
    any later version.

    GCC is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with GCC; see the file COPYING.  If not, write to
    the Free Software Foundation, 51 Franklin Street, Fifth Floor,
    Boston, MA 02110-1301, USA.  */

 /* As a special exception, if you link this library with other files,
    some of which are compiled with GCC, to produce an executable,
    this library does not by itself cause the resulting executable
    to be covered by the GNU General Public License.
    This exception does not however invalidate any other reasons why
    the executable file might be covered by the GNU General Public License.  */

 #ifdef L__divxf3
 // Compute a 80-bit IEEE double-extended quotient.
 //
 // From the Intel IA-64 Optimization Guide, choose the minimum latency
 // alternative.
 //
 // farg0 holds the dividend.  farg1 holds the divisor.
 //
 // __divtf3 is an alternate symbol name for backward compatibility.

 	.text
 	.align 16
 	.global __divxf3
 	.global __divtf3
 	.proc __divxf3
 __divxf3:
 __divtf3:
 	cmp.eq p7, p0 = r0, r0
 	frcpa.s0 f10, p6 = farg0, farg1
 	;;
 (p6)	cmp.ne p7, p0 = r0, r0
 	.pred.rel.mutex p6, p7
 (p6)	fnma.s1 f11 = farg1, f10, f1
 (p6)	fma.s1 f12 = farg0, f10, f0
 	;;
 (p6)	fma.s1 f13 = f11, f11, f0
 (p6)	fma.s1 f14 = f11, f11, f11
 	;;
 (p6)	fma.s1 f11 = f13, f13, f11
 (p6)	fma.s1 f13 = f14, f10, f10
 	;;
 (p6)	fma.s1 f10 = f13, f11, f10
 (p6)	fnma.s1 f11 = farg1, f12, farg0
 	;;
 (p6)	fma.s1 f11 = f11, f10, f12
 (p6)	fnma.s1 f12 = farg1, f10, f1
 	;;
 (p6)	fma.s1 f10 = f12, f10, f10
 (p6)	fnma.s1 f12 = farg1, f11, farg0
 	;;
 (p6)	fma.s0 fret0 = f12, f10, f11
 (p7)	mov fret0 = f10
 	br.ret.sptk rp
 	.endp __divxf3
 #endif

 #ifdef L__divdf3
 // Compute a 64-bit IEEE double quotient.
 //
 // From the Intel IA-64 Optimization Guide, choose the minimum latency
 // alternative.
 //
 // farg0 holds the dividend.  farg1 holds the divisor.

 	.text
 	.align 16
 	.global __divdf3
 	.proc __divdf3
 __divdf3:
 	cmp.eq p7, p0 = r0, r0
 	frcpa.s0 f10, p6 = farg0, farg1
 	;;
 (p6)	cmp.ne p7, p0 = r0, r0
 	.pred.rel.mutex p6, p7
 (p6)	fmpy.s1 f11 = farg0, f10
 (p6)	fnma.s1 f12 = farg1, f10, f1
 	;;
 (p6)	fma.s1 f11 = f12, f11, f11
 (p6)	fmpy.s1 f13 = f12, f12
 	;;
 (p6)	fma.s1 f10 = f12, f10, f10
 (p6)	fma.s1 f11 = f13, f11, f11
 	;;
 (p6)	fmpy.s1 f12 = f13, f13
 (p6)	fma.s1 f10 = f13, f10, f10
 	;;
 (p6)	fma.d.s1 f11 = f12, f11, f11
 (p6)	fma.s1 f10 = f12, f10, f10
 	;;
 (p6)	fnma.d.s1 f8 = farg1, f11, farg0
 	;;
 (p6)	fma.d fret0 = f8, f10, f11
 (p7)	mov fret0 = f10
 	br.ret.sptk rp
 	;;
 	.endp __divdf3
 #endif

 #ifdef L__divsf3
 // Compute a 32-bit IEEE float quotient.
 //
 // From the Intel IA-64 Optimization Guide, choose the minimum latency
 // alternative.
 //
 // farg0 holds the dividend.  farg1 holds the divisor.

 	.text
 	.align 16
 	.global __divsf3
 	.proc __divsf3
 __divsf3:
 	cmp.eq p7, p0 = r0, r0
 	frcpa.s0 f10, p6 = farg0, farg1
 	;;
 (p6)	cmp.ne p7, p0 = r0, r0
 	.pred.rel.mutex p6, p7
 (p6)	fmpy.s1 f8 = farg0, f10
 (p6)	fnma.s1 f9 = farg1, f10, f1
 	;;
 (p6)	fma.s1 f8 = f9, f8, f8
 (p6)	fmpy.s1 f9 = f9, f9
 	;;
 (p6)	fma.s1 f8 = f9, f8, f8
 (p6)	fmpy.s1 f9 = f9, f9
 	;;
 (p6)	fma.d.s1 f10 = f9, f8, f8
 	;;
 (p6)	fnorm.s.s0 fret0 = f10
 (p7)	mov fret0 = f10
 	br.ret.sptk rp
 	;;
 	.endp __divsf3
 #endif

 #ifdef L__divdi3
 // Compute a 64-bit integer quotient.
 //
 // From the Intel IA-64 Optimization Guide, choose the minimum latency
 // alternative.
 //
 // in0 holds the dividend.  in1 holds the divisor.

 	.text
 	.align 16
 	.global __divdi3
 	.proc __divdi3
 __divdi3:
 	.regstk 2,0,0,0
 	// Transfer inputs to FP registers.
 	setf.sig f8 = in0
 	setf.sig f9 = in1
 	// Check divide by zero.
 	cmp.ne.unc p0,p7=0,in1
 	;;
 	// Convert the inputs to FP, so that they won't be treated as unsigned.
 	fcvt.xf f8 = f8
 	fcvt.xf f9 = f9
 (p7)	break 1
 	;;
 	// Compute the reciprocal approximation.
 	frcpa.s1 f10, p6 = f8, f9
 	;;
 	// 3 Newton-Raphson iterations.
 (p6)	fnma.s1 f11 = f9, f10, f1
 (p6)	fmpy.s1 f12 = f8, f10
 	;;
 (p6)	fmpy.s1 f13 = f11, f11
 (p6)	fma.s1 f12 = f11, f12, f12
 	;;
 (p6)	fma.s1 f10 = f11, f10, f10
 (p6)	fma.s1 f11 = f13, f12, f12
 	;;
 (p6)	fma.s1 f10 = f13, f10, f10
 (p6)	fnma.s1 f12 = f9, f11, f8
 	;;
 (p6)	fma.s1 f10 = f12, f10, f11
 	;;
 	// Round quotient to an integer.
 	fcvt.fx.trunc.s1 f10 = f10
 	;;
 	// Transfer result to GP registers.
 	getf.sig ret0 = f10
 	br.ret.sptk rp
 	;;
 	.endp __divdi3
 #endif

 #ifdef L__moddi3
 // Compute a 64-bit integer modulus.
 //
 // From the Intel IA-64 Optimization Guide, choose the minimum latency
 // alternative.
 //
 // in0 holds the dividend (a).  in1 holds the divisor (b).

 	.text
 	.align 16
 	.global __moddi3
 	.proc __moddi3
 __moddi3:
 	.regstk 2,0,0,0
 	// Transfer inputs to FP registers.
 	setf.sig f14 = in0
 	setf.sig f9 = in1
 	// Check divide by zero.
 	cmp.ne.unc p0,p7=0,in1
 	;;
 	// Convert the inputs to FP, so that they won't be treated as unsigned.
 	fcvt.xf f8 = f14
 	fcvt.xf f9 = f9
 (p7)	break 1
 	;;
 	// Compute the reciprocal approximation.
 	frcpa.s1 f10, p6 = f8, f9
 	;;
 	// 3 Newton-Raphson iterations.
 (p6)	fmpy.s1 f12 = f8, f10
 (p6)	fnma.s1 f11 = f9, f10, f1
 	;;
 (p6)	fma.s1 f12 = f11, f12, f12
 (p6)	fmpy.s1 f13 = f11, f11
 	;;
 (p6)	fma.s1 f10 = f11, f10, f10
 (p6)	fma.s1 f11 = f13, f12, f12
 	;;
 	sub in1 = r0, in1
 (p6)	fma.s1 f10 = f13, f10, f10
 (p6)	fnma.s1 f12 = f9, f11, f8
 	;;
 	setf.sig f9 = in1
 (p6)	fma.s1 f10 = f12, f10, f11
 	;;
 	fcvt.fx.trunc.s1 f10 = f10
 	;;
 	// r = q * (-b) + a
 	xma.l f10 = f10, f9, f14
 	;;
 	// Transfer result to GP registers.
 	getf.sig ret0 = f10
 	br.ret.sptk rp
 	;;
 	.endp __moddi3
 #endif

 #ifdef L__udivdi3
 // Compute a 64-bit unsigned integer quotient.
 //
 // From the Intel IA-64 Optimization Guide, choose the minimum latency
 // alternative.
 //
 // in0 holds the dividend.  in1 holds the divisor.

 	.text
 	.align 16
 	.global __udivdi3
 	.proc __udivdi3
 __udivdi3:
 	.regstk 2,0,0,0
 	// Transfer inputs to FP registers.
 	setf.sig f8 = in0
 	setf.sig f9 = in1
 	// Check divide by zero.
 	cmp.ne.unc p0,p7=0,in1
 	;;
 	// Convert the inputs to FP, to avoid FP software-assist faults.
 	fcvt.xuf.s1 f8 = f8
 	fcvt.xuf.s1 f9 = f9
 (p7)	break 1
 	;;
 	// Compute the reciprocal approximation.
 	frcpa.s1 f10, p6 = f8, f9
 	;;
 	// 3 Newton-Raphson iterations.
 (p6)	fnma.s1 f11 = f9, f10, f1
 (p6)	fmpy.s1 f12 = f8, f10
 	;;
 (p6)	fmpy.s1 f13 = f11, f11
 (p6)	fma.s1 f12 = f11, f12, f12
 	;;
 (p6)	fma.s1 f10 = f11, f10, f10
 (p6)	fma.s1 f11 = f13, f12, f12
 	;;
 (p6)	fma.s1 f10 = f13, f10, f10
 (p6)	fnma.s1 f12 = f9, f11, f8
 	;;
 (p6)	fma.s1 f10 = f12, f10, f11
 	;;
 	// Round quotient to an unsigned integer.
 	fcvt.fxu.trunc.s1 f10 = f10
 	;;
 	// Transfer result to GP registers.
 	getf.sig ret0 = f10
 	br.ret.sptk rp
 	;;
 	.endp __udivdi3
 #endif

 #ifdef L__umoddi3
 // Compute a 64-bit unsigned integer modulus.
 //
 // From the Intel IA-64 Optimization Guide, choose the minimum latency
 // alternative.
 //
 // in0 holds the dividend (a).  in1 holds the divisor (b).

 	.text
 	.align 16
 	.global __umoddi3
 	.proc __umoddi3
 __umoddi3:
 	.regstk 2,0,0,0
 	// Transfer inputs to FP registers.
 	setf.sig f14 = in0
 	setf.sig f9 = in1
 	// Check divide by zero.
 	cmp.ne.unc p0,p7=0,in1
 	;;
 	// Convert the inputs to FP, to avoid FP software assist faults.
 	fcvt.xuf.s1 f8 = f14
 	fcvt.xuf.s1 f9 = f9
 (p7)	break 1;
 	;;
 	// Compute the reciprocal approximation.
 	frcpa.s1 f10, p6 = f8, f9
 	;;
 	// 3 Newton-Raphson iterations.
 (p6)	fmpy.s1 f12 = f8, f10
 (p6)	fnma.s1 f11 = f9, f10, f1
 	;;
 (p6)	fma.s1 f12 = f11, f12, f12
 (p6)	fmpy.s1 f13 = f11, f11
 	;;
 (p6)	fma.s1 f10 = f11, f10, f10
 (p6)	fma.s1 f11 = f13, f12, f12
 	;;
 	sub in1 = r0, in1
 (p6)	fma.s1 f10 = f13, f10, f10
 (p6)	fnma.s1 f12 = f9, f11, f8
 	;;
 	setf.sig f9 = in1
 (p6)	fma.s1 f10 = f12, f10, f11
 	;;
 	// Round quotient to an unsigned integer.
 	fcvt.fxu.trunc.s1 f10 = f10
 	;;
 	// r = q * (-b) + a
 	xma.l f10 = f10, f9, f14
 	;;
 	// Transfer result to GP registers.
 	getf.sig ret0 = f10
 	br.ret.sptk rp
 	;;
 	.endp __umoddi3
 #endif

 #ifdef L__divsi3
 // Compute a 32-bit integer quotient.
 //
 // From the Intel IA-64 Optimization Guide, choose the minimum latency
 // alternative.
 //
 // in0 holds the dividend.  in1 holds the divisor.

 	.text
 	.align 16
 	.global __divsi3
 	.proc __divsi3
 __divsi3:
 	.regstk 2,0,0,0
 	// Check divide by zero.
 	cmp.ne.unc p0,p7=0,in1
 	sxt4 in0 = in0
 	sxt4 in1 = in1
 	;;
 	setf.sig f8 = in0
 	setf.sig f9 = in1
 (p7)	break 1
 	;;
 	mov r2 = 0x0ffdd
 	fcvt.xf f8 = f8
 	fcvt.xf f9 = f9
 	;;
 	setf.exp f11 = r2
 	frcpa.s1 f10, p6 = f8, f9
 	;;
 (p6)	fmpy.s1 f8 = f8, f10
 (p6)	fnma.s1 f9 = f9, f10, f1
 	;;
 (p6)	fma.s1 f8 = f9, f8, f8
 (p6)	fma.s1 f9 = f9, f9, f11
 	;;
 (p6)	fma.s1 f10 = f9, f8, f8
 	;;
 	fcvt.fx.trunc.s1 f10 = f10
 	;;
 	getf.sig ret0 = f10
 	br.ret.sptk rp
 	;;
 	.endp __divsi3
 #endif

 #ifdef L__modsi3
 // Compute a 32-bit integer modulus.
 //
 // From the Intel IA-64 Optimization Guide, choose the minimum latency
 // alternative.
 //
 // in0 holds the dividend.  in1 holds the divisor.

 	.text
 	.align 16
 	.global __modsi3
 	.proc __modsi3
 __modsi3:
 	.regstk 2,0,0,0
 	mov r2 = 0x0ffdd
 	sxt4 in0 = in0
 	sxt4 in1 = in1
 	;;
 	setf.sig f13 = r32
 	setf.sig f9 = r33
 	// Check divide by zero.
 	cmp.ne.unc p0,p7=0,in1
 	;;
 	sub in1 = r0, in1
 	fcvt.xf f8 = f13
 	fcvt.xf f9 = f9
 	;;
 	setf.exp f11 = r2
 	frcpa.s1 f10, p6 = f8, f9
 (p7)	break 1
 	;;
 (p6)	fmpy.s1 f12 = f8, f10
 (p6)	fnma.s1 f10 = f9, f10, f1
 	;;
 	setf.sig f9 = in1
 (p6)	fma.s1 f12 = f10, f12, f12
 (p6)	fma.s1 f10 = f10, f10, f11
 	;;
 (p6)	fma.s1 f10 = f10, f12, f12
 	;;
 	fcvt.fx.trunc.s1 f10 = f10
 	;;
 	xma.l f10 = f10, f9, f13
 	;;
 	getf.sig ret0 = f10
 	br.ret.sptk rp
 	;;
 	.endp __modsi3
 #endif

 #ifdef L__udivsi3
 // Compute a 32-bit unsigned integer quotient.
 //
 // From the Intel IA-64 Optimization Guide, choose the minimum latency
 // alternative.
 //
 // in0 holds the dividend.  in1 holds the divisor.

 	.text
 	.align 16
 	.global __udivsi3
 	.proc __udivsi3
 __udivsi3:
 	.regstk 2,0,0,0
 	mov r2 = 0x0ffdd
 	zxt4 in0 = in0
 	zxt4 in1 = in1
 	;;
 	setf.sig f8 = in0
 	setf.sig f9 = in1
 	// Check divide by zero.
 	cmp.ne.unc p0,p7=0,in1
 	;;
 	fcvt.xf f8 = f8
 	fcvt.xf f9 = f9
 (p7)	break 1
 	;;
 	setf.exp f11 = r2
 	frcpa.s1 f10, p6 = f8, f9
 	;;
 (p6)	fmpy.s1 f8 = f8, f10
 (p6)	fnma.s1 f9 = f9, f10, f1
 	;;
 (p6)	fma.s1 f8 = f9, f8, f8
 (p6)	fma.s1 f9 = f9, f9, f11
 	;;
 (p6)	fma.s1 f10 = f9, f8, f8
 	;;
 	fcvt.fxu.trunc.s1 f10 = f10
 	;;
 	getf.sig ret0 = f10
 	br.ret.sptk rp
 	;;
 	.endp __udivsi3
 #endif

 #ifdef L__umodsi3
 // Compute a 32-bit unsigned integer modulus.
 //
 // From the Intel IA-64 Optimization Guide, choose the minimum latency
 // alternative.
 //
 // in0 holds the dividend.  in1 holds the divisor.

 	.text
 	.align 16
 	.global __umodsi3
 	.proc __umodsi3
 __umodsi3:
 	.regstk 2,0,0,0
 	mov r2 = 0x0ffdd
 	zxt4 in0 = in0
 	zxt4 in1 = in1
 	;;
 	setf.sig f13 = in0
 	setf.sig f9 = in1
 	// Check divide by zero.
 	cmp.ne.unc p0,p7=0,in1
 	;;
 	sub in1 = r0, in1
 	fcvt.xf f8 = f13
 	fcvt.xf f9 = f9
 	;;
 	setf.exp f11 = r2
 	frcpa.s1 f10, p6 = f8, f9
 (p7)	break 1;
 	;;
 (p6)	fmpy.s1 f12 = f8, f10
 (p6)	fnma.s1 f10 = f9, f10, f1
 	;;
 	setf.sig f9 = in1
 (p6)	fma.s1 f12 = f10, f12, f12
 (p6)	fma.s1 f10 = f10, f10, f11
 	;;
 (p6)	fma.s1 f10 = f10, f12, f12
 	;;
 	fcvt.fxu.trunc.s1 f10 = f10
 	;;
 	xma.l f10 = f10, f9, f13
 	;;
 	getf.sig ret0 = f10
 	br.ret.sptk rp
 	;;
 	.endp __umodsi3
 #endif

 #ifdef L__save_stack_nonlocal
 // Notes on save/restore stack nonlocal: We read ar.bsp but write
 // ar.bspstore.  This is because ar.bsp can be read at all times
 // (independent of the RSE mode) but since it's read-only we need to
 // restore the value via ar.bspstore.  This is OK because
 // ar.bsp==ar.bspstore after executing "flushrs".

 // void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)

 	.text
 	.align 16
 	.global __ia64_save_stack_nonlocal
 	.proc __ia64_save_stack_nonlocal
 __ia64_save_stack_nonlocal:
 	{ .mmf
 	  alloc r18 = ar.pfs, 2, 0, 0, 0
 	  mov r19 = ar.rsc
 	  ;;
 	}
 	{ .mmi
 	  flushrs
 	  st8 [in0] = in1, 24
 	  and r19 = 0x1c, r19
 	  ;;
 	}
 	{ .mmi
 	  st8 [in0] = r18, -16
 	  mov ar.rsc = r19
 	  or r19 = 0x3, r19
 	  ;;
 	}
 	{ .mmi
 	  mov r16 = ar.bsp
 	  mov r17 = ar.rnat
 	  adds r2 = 8, in0
 	  ;;
 	}
 	{ .mmi
 	  st8 [in0] = r16
 	  st8 [r2] = r17
 	}
 	{ .mib
 	  mov ar.rsc = r19
 	  br.ret.sptk.few rp
 	  ;;
 	}
 	.endp __ia64_save_stack_nonlocal
 #endif

 #ifdef L__nonlocal_goto
 // void __ia64_nonlocal_goto(void *target_label, void *save_area,
 //			     void *static_chain);

 	.text
 	.align 16
 	.global __ia64_nonlocal_goto
 	.proc __ia64_nonlocal_goto
 __ia64_nonlocal_goto:
 	{ .mmi
 	  alloc r20 = ar.pfs, 3, 0, 0, 0
 	  ld8 r12 = [in1], 8
 	  mov.ret.sptk rp = in0, .L0
 	  ;;
 	}
 	{ .mmf
 	  ld8 r16 = [in1], 8
 	  mov r19 = ar.rsc
 	  ;;
 	}
 	{ .mmi
 	  flushrs
 	  ld8 r17 = [in1], 8
 	  and r19 = 0x1c, r19
 	  ;;
 	}
 	{ .mmi
 	  ld8 r18 = [in1]
 	  mov ar.rsc = r19
 	  or r19 = 0x3, r19
 	  ;;
 	}
 	{ .mmi
 	  mov ar.bspstore = r16
 	  ;;
 	  mov ar.rnat = r17
 	  ;;
 	}
 	{ .mmi
 	  loadrs
 	  invala
 	  mov r15 = in2
 	  ;;
 	}
 .L0:	{ .mib
 	  mov ar.rsc = r19
 	  mov ar.pfs = r18
 	  br.ret.sptk.few rp
 	  ;;
 	}
 	.endp __ia64_nonlocal_goto
 #endif

 #ifdef L__restore_stack_nonlocal
 // This is mostly the same as nonlocal_goto above.
 // ??? This has not been tested yet.

 // void __ia64_restore_stack_nonlocal(void *save_area)

 	.text
 	.align 16
 	.global __ia64_restore_stack_nonlocal
 	.proc __ia64_restore_stack_nonlocal
 __ia64_restore_stack_nonlocal:
 	{ .mmf
 	  alloc r20 = ar.pfs, 4, 0, 0, 0
 	  ld8 r12 = [in0], 8
 	  ;;
 	}
 	{ .mmb
 	  ld8 r16=[in0], 8
 	  mov r19 = ar.rsc
 	  ;;
 	}
 	{ .mmi
 	  flushrs
 	  ld8 r17 = [in0], 8
 	  and r19 = 0x1c, r19
 	  ;;
 	}
 	{ .mmf
 	  ld8 r18 = [in0]
 	  mov ar.rsc = r19
 	  ;;
 	}
 	{ .mmi
 	  mov ar.bspstore = r16
 	  ;;
 	  mov ar.rnat = r17
 	  or r19 = 0x3, r19
 	  ;;
 	}
 	{ .mmf
 	  loadrs
 	  invala
 	  ;;
 	}
 .L0:	{ .mib
 	  mov ar.rsc = r19
 	  mov ar.pfs = r18
 	  br.ret.sptk.few rp
 	  ;;
 	}
 	.endp __ia64_restore_stack_nonlocal
 #endif

 #ifdef L__trampoline
 // Implement the nested function trampoline.  This is out of line
 // so that we don't have to bother with flushing the icache, as
 // well as making the on-stack trampoline smaller.
 //
 // The trampoline has the following form:
 //
 //		+-------------------+ >
 //	TRAMP:	| __ia64_trampoline | |
 //		+-------------------+  > fake function descriptor
 //		| TRAMP+16          | |
 //		+-------------------+ >
 //		| target descriptor |
 //		+-------------------+
 //		| static link	    |
 //		+-------------------+

 	.text
 	.align 16
 	.global __ia64_trampoline
 	.proc __ia64_trampoline
 __ia64_trampoline:
 	{ .mmi
 	  ld8 r2 = [r1], 8
 	  ;;
 	  ld8 r15 = [r1]
 	}
 	{ .mmi
 	  ld8 r3 = [r2], 8
 	  ;;
 	  ld8 r1 = [r2]
 	  mov b6 = r3
 	}
 	{ .bbb
 	  br.sptk.many b6
 	  ;;
 	}
 	.endp __ia64_trampoline
 #endif

 // Thunks for backward compatibility.
 #ifdef L_fixtfdi
 	.text
 	.align 16
 	.global __fixtfti
 	.proc __fixtfti
 __fixtfti:
 	{ .bbb
 	  br.sptk.many __fixxfti
 	  ;;
 	}
 	.endp __fixtfti
 #endif
 #ifdef L_fixunstfdi
 	.align 16
 	.global __fixunstfti
 	.proc __fixunstfti
 __fixunstfti:
 	{ .bbb
 	  br.sptk.many __fixunsxfti
 	  ;;
 	}
 	.endp __fixunstfti
 #endif
 #if L_floatditf
 	.align 16
 	.global __floattitf
 	.proc __floattitf
 __floattitf:
 	{ .bbb
 	  br.sptk.many __floattixf
 	  ;;
 	}
 	.endp __floattitf
 #endif
	/* Copyright (C) 2000, 2001, 2003, 2005 Free Software Foundation, Inc.
	Contributed by James E. Wilson <wilson@cygnus.com>.

	This file is part of GCC.

	GCC is free software; you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation; either version 2, or (at your option)
	any later version.

	GCC is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with GCC; see the file COPYING. If not, write to
	the Free Software Foundation, 51 Franklin Street, Fifth Floor,
	Boston, MA 02110-1301, USA. */

	/* As a special exception, if you link this library with other files,
	some of which are compiled with GCC, to produce an executable,
	this library does not by itself cause the resulting executable
	to be covered by the GNU General Public License.
	This exception does not however invalidate any other reasons why
	the executable file might be covered by the GNU General Public License. */

	#ifdef L__divxf3
	// Compute a 80-bit IEEE double-extended quotient.
	//
	// From the Intel IA-64 Optimization Guide, choose the minimum latency
	// alternative.
	//
	// farg0 holds the dividend. farg1 holds the divisor.
	//
	// __divtf3 is an alternate symbol name for backward compatibility.

	.text
	.align 16
	.global __divxf3
	.global __divtf3
	.proc __divxf3
	__divxf3:
	__divtf3:
	cmp.eq p7, p0 = r0, r0
	frcpa.s0 f10, p6 = farg0, farg1
	;;
	(p6) cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
	(p6) fnma.s1 f11 = farg1, f10, f1
	(p6) fma.s1 f12 = farg0, f10, f0
	;;
	(p6) fma.s1 f13 = f11, f11, f0
	(p6) fma.s1 f14 = f11, f11, f11
	;;
	(p6) fma.s1 f11 = f13, f13, f11
	(p6) fma.s1 f13 = f14, f10, f10
	;;
	(p6) fma.s1 f10 = f13, f11, f10
	(p6) fnma.s1 f11 = farg1, f12, farg0
	;;
	(p6) fma.s1 f11 = f11, f10, f12
	(p6) fnma.s1 f12 = farg1, f10, f1
	;;
	(p6) fma.s1 f10 = f12, f10, f10
	(p6) fnma.s1 f12 = farg1, f11, farg0
	;;
	(p6) fma.s0 fret0 = f12, f10, f11
	(p7) mov fret0 = f10
	br.ret.sptk rp
	.endp __divxf3
	#endif

	#ifdef L__divdf3
	// Compute a 64-bit IEEE double quotient.
	//
	// From the Intel IA-64 Optimization Guide, choose the minimum latency
	// alternative.
	//
	// farg0 holds the dividend. farg1 holds the divisor.

	.text
	.align 16
	.global __divdf3
	.proc __divdf3
	__divdf3:
	cmp.eq p7, p0 = r0, r0
	frcpa.s0 f10, p6 = farg0, farg1
	;;
	(p6) cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
	(p6) fmpy.s1 f11 = farg0, f10
	(p6) fnma.s1 f12 = farg1, f10, f1
	;;
	(p6) fma.s1 f11 = f12, f11, f11
	(p6) fmpy.s1 f13 = f12, f12
	;;
	(p6) fma.s1 f10 = f12, f10, f10
	(p6) fma.s1 f11 = f13, f11, f11
	;;
	(p6) fmpy.s1 f12 = f13, f13
	(p6) fma.s1 f10 = f13, f10, f10
	;;
	(p6) fma.d.s1 f11 = f12, f11, f11
	(p6) fma.s1 f10 = f12, f10, f10
	;;
	(p6) fnma.d.s1 f8 = farg1, f11, farg0
	;;
	(p6) fma.d fret0 = f8, f10, f11
	(p7) mov fret0 = f10
	br.ret.sptk rp
	;;
	.endp __divdf3
	#endif

	#ifdef L__divsf3
	// Compute a 32-bit IEEE float quotient.
	//
	// From the Intel IA-64 Optimization Guide, choose the minimum latency
	// alternative.
	//
	// farg0 holds the dividend. farg1 holds the divisor.

	.text
	.align 16
	.global __divsf3
	.proc __divsf3
	__divsf3:
	cmp.eq p7, p0 = r0, r0
	frcpa.s0 f10, p6 = farg0, farg1
	;;
	(p6) cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
	(p6) fmpy.s1 f8 = farg0, f10
	(p6) fnma.s1 f9 = farg1, f10, f1
	;;
	(p6) fma.s1 f8 = f9, f8, f8
	(p6) fmpy.s1 f9 = f9, f9
	;;
	(p6) fma.s1 f8 = f9, f8, f8
	(p6) fmpy.s1 f9 = f9, f9
	;;
	(p6) fma.d.s1 f10 = f9, f8, f8
	;;
	(p6) fnorm.s.s0 fret0 = f10
	(p7) mov fret0 = f10
	br.ret.sptk rp
	;;
	.endp __divsf3
	#endif

	#ifdef L__divdi3
	// Compute a 64-bit integer quotient.
	//
	// From the Intel IA-64 Optimization Guide, choose the minimum latency
	// alternative.
	//
	// in0 holds the dividend. in1 holds the divisor.

	.text
	.align 16
	.global __divdi3
	.proc __divdi3
	__divdi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, so that they won't be treated as unsigned.
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	(p7) break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
	(p6) fnma.s1 f11 = f9, f10, f1
	(p6) fmpy.s1 f12 = f8, f10
	;;
	(p6) fmpy.s1 f13 = f11, f11
	(p6) fma.s1 f12 = f11, f12, f12
	;;
	(p6) fma.s1 f10 = f11, f10, f10
	(p6) fma.s1 f11 = f13, f12, f12
	;;
	(p6) fma.s1 f10 = f13, f10, f10
	(p6) fnma.s1 f12 = f9, f11, f8
	;;
	(p6) fma.s1 f10 = f12, f10, f11
	;;
	// Round quotient to an integer.
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __divdi3
	#endif

	#ifdef L__moddi3
	// Compute a 64-bit integer modulus.
	//
	// From the Intel IA-64 Optimization Guide, choose the minimum latency
	// alternative.
	//
	// in0 holds the dividend (a). in1 holds the divisor (b).

	.text
	.align 16
	.global __moddi3
	.proc __moddi3
	__moddi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f14 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, so that they won't be treated as unsigned.
	fcvt.xf f8 = f14
	fcvt.xf f9 = f9
	(p7) break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
	(p6) fmpy.s1 f12 = f8, f10
	(p6) fnma.s1 f11 = f9, f10, f1
	;;
	(p6) fma.s1 f12 = f11, f12, f12
	(p6) fmpy.s1 f13 = f11, f11
	;;
	(p6) fma.s1 f10 = f11, f10, f10
	(p6) fma.s1 f11 = f13, f12, f12
	;;
	sub in1 = r0, in1
	(p6) fma.s1 f10 = f13, f10, f10
	(p6) fnma.s1 f12 = f9, f11, f8
	;;
	setf.sig f9 = in1
	(p6) fma.s1 f10 = f12, f10, f11
	;;
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f14
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __moddi3
	#endif

	#ifdef L__udivdi3
	// Compute a 64-bit unsigned integer quotient.
	//
	// From the Intel IA-64 Optimization Guide, choose the minimum latency
	// alternative.
	//
	// in0 holds the dividend. in1 holds the divisor.

	.text
	.align 16
	.global __udivdi3
	.proc __udivdi3
	__udivdi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, to avoid FP software-assist faults.
	fcvt.xuf.s1 f8 = f8
	fcvt.xuf.s1 f9 = f9
	(p7) break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
	(p6) fnma.s1 f11 = f9, f10, f1
	(p6) fmpy.s1 f12 = f8, f10
	;;
	(p6) fmpy.s1 f13 = f11, f11
	(p6) fma.s1 f12 = f11, f12, f12
	;;
	(p6) fma.s1 f10 = f11, f10, f10
	(p6) fma.s1 f11 = f13, f12, f12
	;;
	(p6) fma.s1 f10 = f13, f10, f10
	(p6) fnma.s1 f12 = f9, f11, f8
	;;
	(p6) fma.s1 f10 = f12, f10, f11
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __udivdi3
	#endif

	#ifdef L__umoddi3
	// Compute a 64-bit unsigned integer modulus.
	//
	// From the Intel IA-64 Optimization Guide, choose the minimum latency
	// alternative.
	//
	// in0 holds the dividend (a). in1 holds the divisor (b).

	.text
	.align 16
	.global __umoddi3
	.proc __umoddi3
	__umoddi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f14 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, to avoid FP software assist faults.
	fcvt.xuf.s1 f8 = f14
	fcvt.xuf.s1 f9 = f9
	(p7) break 1;
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
	(p6) fmpy.s1 f12 = f8, f10
	(p6) fnma.s1 f11 = f9, f10, f1
	;;
	(p6) fma.s1 f12 = f11, f12, f12
	(p6) fmpy.s1 f13 = f11, f11
	;;
	(p6) fma.s1 f10 = f11, f10, f10
	(p6) fma.s1 f11 = f13, f12, f12
	;;
	sub in1 = r0, in1
	(p6) fma.s1 f10 = f13, f10, f10
	(p6) fnma.s1 f12 = f9, f11, f8
	;;
	setf.sig f9 = in1
	(p6) fma.s1 f10 = f12, f10, f11
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f14
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __umoddi3
	#endif

	#ifdef L__divsi3
	// Compute a 32-bit integer quotient.
	//
	// From the Intel IA-64 Optimization Guide, choose the minimum latency
	// alternative.
	//
	// in0 holds the dividend. in1 holds the divisor.

	.text
	.align 16
	.global __divsi3
	.proc __divsi3
	__divsi3:
	.regstk 2,0,0,0
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	sxt4 in0 = in0
	sxt4 in1 = in1
	;;
	setf.sig f8 = in0
	setf.sig f9 = in1
	(p7) break 1
	;;
	mov r2 = 0x0ffdd
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
	;;
	(p6) fmpy.s1 f8 = f8, f10
	(p6) fnma.s1 f9 = f9, f10, f1
	;;
	(p6) fma.s1 f8 = f9, f8, f8
	(p6) fma.s1 f9 = f9, f9, f11
	;;
	(p6) fma.s1 f10 = f9, f8, f8
	;;
	fcvt.fx.trunc.s1 f10 = f10
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __divsi3
	#endif

	#ifdef L__modsi3
	// Compute a 32-bit integer modulus.
	//
	// From the Intel IA-64 Optimization Guide, choose the minimum latency
	// alternative.
	//
	// in0 holds the dividend. in1 holds the divisor.

	.text
	.align 16
	.global __modsi3
	.proc __modsi3
	__modsi3:
	.regstk 2,0,0,0
	mov r2 = 0x0ffdd
	sxt4 in0 = in0
	sxt4 in1 = in1
	;;
	setf.sig f13 = r32
	setf.sig f9 = r33
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	sub in1 = r0, in1
	fcvt.xf f8 = f13
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
	(p7) break 1
	;;
	(p6) fmpy.s1 f12 = f8, f10
	(p6) fnma.s1 f10 = f9, f10, f1
	;;
	setf.sig f9 = in1
	(p6) fma.s1 f12 = f10, f12, f12
	(p6) fma.s1 f10 = f10, f10, f11
	;;
	(p6) fma.s1 f10 = f10, f12, f12
	;;
	fcvt.fx.trunc.s1 f10 = f10
	;;
	xma.l f10 = f10, f9, f13
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __modsi3
	#endif

	#ifdef L__udivsi3
	// Compute a 32-bit unsigned integer quotient.
	//
	// From the Intel IA-64 Optimization Guide, choose the minimum latency
	// alternative.
	//
	// in0 holds the dividend. in1 holds the divisor.

	.text
	.align 16
	.global __udivsi3
	.proc __udivsi3
	__udivsi3:
	.regstk 2,0,0,0
	mov r2 = 0x0ffdd
	zxt4 in0 = in0
	zxt4 in1 = in1
	;;
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	(p7) break 1
	;;
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
	;;
	(p6) fmpy.s1 f8 = f8, f10
	(p6) fnma.s1 f9 = f9, f10, f1
	;;
	(p6) fma.s1 f8 = f9, f8, f8
	(p6) fma.s1 f9 = f9, f9, f11
	;;
	(p6) fma.s1 f10 = f9, f8, f8
	;;
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __udivsi3
	#endif

	#ifdef L__umodsi3
	// Compute a 32-bit unsigned integer modulus.
	//
	// From the Intel IA-64 Optimization Guide, choose the minimum latency
	// alternative.
	//
	// in0 holds the dividend. in1 holds the divisor.

	.text
	.align 16
	.global __umodsi3
	.proc __umodsi3
	__umodsi3:
	.regstk 2,0,0,0
	mov r2 = 0x0ffdd
	zxt4 in0 = in0
	zxt4 in1 = in1
	;;
	setf.sig f13 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	sub in1 = r0, in1
	fcvt.xf f8 = f13
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
	(p7) break 1;
	;;
	(p6) fmpy.s1 f12 = f8, f10
	(p6) fnma.s1 f10 = f9, f10, f1
	;;
	setf.sig f9 = in1
	(p6) fma.s1 f12 = f10, f12, f12
	(p6) fma.s1 f10 = f10, f10, f11
	;;
	(p6) fma.s1 f10 = f10, f12, f12
	;;
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	xma.l f10 = f10, f9, f13
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __umodsi3
	#endif

	#ifdef L__save_stack_nonlocal
	// Notes on save/restore stack nonlocal: We read ar.bsp but write
	// ar.bspstore. This is because ar.bsp can be read at all times
	// (independent of the RSE mode) but since it's read-only we need to
	// restore the value via ar.bspstore. This is OK because
	// ar.bsp==ar.bspstore after executing "flushrs".

	// void __ia64_save_stack_nonlocal(void save_area, void stack_pointer)

	.text
	.align 16
	.global __ia64_save_stack_nonlocal
	.proc __ia64_save_stack_nonlocal
	__ia64_save_stack_nonlocal:
	{ .mmf
	alloc r18 = ar.pfs, 2, 0, 0, 0
	mov r19 = ar.rsc
	;;
	}
	{ .mmi
	flushrs
	st8 [in0] = in1, 24
	and r19 = 0x1c, r19
	;;
	}
	{ .mmi
	st8 [in0] = r18, -16
	mov ar.rsc = r19
	or r19 = 0x3, r19
	;;
	}
	{ .mmi
	mov r16 = ar.bsp
	mov r17 = ar.rnat
	adds r2 = 8, in0
	;;
	}
	{ .mmi
	st8 [in0] = r16
	st8 [r2] = r17
	}
	{ .mib
	mov ar.rsc = r19
	br.ret.sptk.few rp
	;;
	}
	.endp __ia64_save_stack_nonlocal
	#endif

	#ifdef L__nonlocal_goto
	// void __ia64_nonlocal_goto(void target_label, void save_area,
	// void *static_chain);

	.text
	.align 16
	.global __ia64_nonlocal_goto
	.proc __ia64_nonlocal_goto
	__ia64_nonlocal_goto:
	{ .mmi
	alloc r20 = ar.pfs, 3, 0, 0, 0
	ld8 r12 = [in1], 8
	mov.ret.sptk rp = in0, .L0
	;;
	}
	{ .mmf
	ld8 r16 = [in1], 8
	mov r19 = ar.rsc
	;;
	}
	{ .mmi
	flushrs
	ld8 r17 = [in1], 8
	and r19 = 0x1c, r19
	;;
	}
	{ .mmi
	ld8 r18 = [in1]
	mov ar.rsc = r19
	or r19 = 0x3, r19
	;;
	}
	{ .mmi
	mov ar.bspstore = r16
	;;
	mov ar.rnat = r17
	;;
	}
	{ .mmi
	loadrs
	invala
	mov r15 = in2
	;;
	}
	.L0: { .mib
	mov ar.rsc = r19
	mov ar.pfs = r18
	br.ret.sptk.few rp
	;;
	}
	.endp __ia64_nonlocal_goto
	#endif

	#ifdef L__restore_stack_nonlocal
	// This is mostly the same as nonlocal_goto above.
	// ??? This has not been tested yet.

	// void __ia64_restore_stack_nonlocal(void *save_area)

	.text
	.align 16
	.global __ia64_restore_stack_nonlocal
	.proc __ia64_restore_stack_nonlocal
	__ia64_restore_stack_nonlocal:
	{ .mmf
	alloc r20 = ar.pfs, 4, 0, 0, 0
	ld8 r12 = [in0], 8
	;;
	}
	{ .mmb
	ld8 r16=[in0], 8
	mov r19 = ar.rsc
	;;
	}
	{ .mmi
	flushrs
	ld8 r17 = [in0], 8
	and r19 = 0x1c, r19
	;;
	}
	{ .mmf
	ld8 r18 = [in0]
	mov ar.rsc = r19
	;;
	}
	{ .mmi
	mov ar.bspstore = r16
	;;
	mov ar.rnat = r17
	or r19 = 0x3, r19
	;;
	}
	{ .mmf
	loadrs
	invala
	;;
	}
	.L0: { .mib
	mov ar.rsc = r19
	mov ar.pfs = r18
	br.ret.sptk.few rp
	;;
	}
	.endp __ia64_restore_stack_nonlocal
	#endif

	#ifdef L__trampoline
	// Implement the nested function trampoline. This is out of line
	// so that we don't have to bother with flushing the icache, as
	// well as making the on-stack trampoline smaller.
	//
	// The trampoline has the following form:
	//
	// +-------------------+ >
	// TRAMP: \| __ia64_trampoline \| \|
	// +-------------------+ > fake function descriptor
	// \| TRAMP+16 \| \|
	// +-------------------+ >
	// \| target descriptor \|
	// +-------------------+
	// \| static link \|
	// +-------------------+

	.text
	.align 16
	.global __ia64_trampoline
	.proc __ia64_trampoline
	__ia64_trampoline:
	{ .mmi
	ld8 r2 = [r1], 8
	;;
	ld8 r15 = [r1]
	}
	{ .mmi
	ld8 r3 = [r2], 8
	;;
	ld8 r1 = [r2]
	mov b6 = r3
	}
	{ .bbb
	br.sptk.many b6
	;;
	}
	.endp __ia64_trampoline
	#endif

	// Thunks for backward compatibility.
	#ifdef L_fixtfdi
	.text
	.align 16
	.global __fixtfti
	.proc __fixtfti
	__fixtfti:
	{ .bbb
	br.sptk.many __fixxfti
	;;
	}
	.endp __fixtfti
	#endif
	#ifdef L_fixunstfdi
	.align 16
	.global __fixunstfti
	.proc __fixunstfti
	__fixunstfti:
	{ .bbb
	br.sptk.many __fixunsxfti
	;;
	}
	.endp __fixunstfti
	#endif
	#if L_floatditf
	.align 16
	.global __floattitf
	.proc __floattitf
	__floattitf:
	{ .bbb
	br.sptk.many __floattixf
	;;
	}
	.endp __floattitf
	#endif