// compiler-rt/lib/builtins/hexagon/dfmul.S - llvm-project - Git at Google

 //===----------------------Hexagon builtin routine ------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//

 // Double Precision Multiply
 // Operand registers. A and B hold the two double-precision inputs and the
 // product is left in A. The H/L names are the high and low 32-bit words
 // of each 64-bit register pair.
 #define A r1:0
 #define AH r1
 #define AL r0
 #define B r3:2
 #define BH r3
 #define BL r2

 // Scratch pair: holds B's aligned mantissa while the partial products are
 // formed, then is reused for exponent bounds, shift counts and USR masks.
 #define BTMP r5:4
 #define BTMPH r5
 #define BTMPL r4

 // Accumulator for the two cross (high*low) partial products. These names
 // are #undef'd after the multiply and r7:6 is renamed EXP1/EXP0.
 #define PP_ODD r7:6
 #define PP_ODD_H r7
 #define PP_ODD_L r6

 // 64-bit constant 1 (loaded with "ONE = #1"): S_ONE is its low word (1)
 // and S_ZERO its high word (0).
 #define ONE r9:8
 #define S_ONE r8
 #define S_ZERO r9

 // High*high partial product; after accumulation it carries the upper
 // 64 bits of the mantissa product and is the input to convert_d2df.
 #define PP_HH r11:10
 #define PP_HH_H r11
 #define PP_HH_L r10

 // Scratch pair: A's aligned mantissa during the multiply; later
 // abs(PP_HH) on the overflow/underflow path and |A| on the abnormal path.
 #define ATMP r13:12
 #define ATMPH r13
 #define ATMPL r12

 // Low*low partial product; later reused for the negated product, the
 // shifted-out bits and the rounding-mode bits.
 #define PP_LL r15:14
 #define PP_LL_H r15
 #define PP_LL_L r14

 // General-purpose 32-bit scratch register.
 #define TMP r28

 // Double-precision field widths: 52 mantissa bits (20 of them in the high
 // word) and 11 exponent bits.
 #define MANTBITS 52
 #define HI_MANTBITS 20
 #define EXPBITS 11
 // NOTE(review): the exponent arithmetic below is written around 1024
 // (see the "-BIAS-58" adjustments); change only together with those.
 #define BIAS 1024
 // Not referenced by any instruction in this file as shown.
 #define MANTISSA_TO_INT_BIAS 52

 // Some constant to adjust normalization amount in error code
 // Amount to right shift the partial product to get to a denorm
 #define FUDGE 5

 // Alias helpers: each emits a global symbol (__qdsp_<TAG>,
 // __hexagon_fast_<TAG>, __hexagon_fast2_<TAG>) set equal to
 // __hexagon_<TAG>. END records the symbol size via .size.
 #define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
 #define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG
 #define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG
 #define END(TAG) .size TAG,.-TAG

 // Bit offset of the 2-bit rounding-mode field read out of USR.
 #define SR_ROUND_OFF 22
 	// __hexagon_muldf3: double-precision multiply, A = A * B.
 	// In:  A (r1:0) and B (r3:2).  Out: product in A.  Returns via r31.
 	// The inline path handles the case where both inputs pass the
 	// dfclass #2 test and the result exponent is in range; everything
 	// else branches to .Lmul_abnormal or .Lmul_ovf_unf.
 	.text
 	.global __hexagon_muldf3
 	.type __hexagon_muldf3,@function
 	Q6_ALIAS(muldf3)
   FAST_ALIAS(muldf3)
   FAST2_ALIAS(muldf3)
 	.p2align 5
 __hexagon_muldf3:
 	{
 		// Two writes to p0 in one packet are ANDed: p0 is true only when
 		// both A and B match dfclass mask #2.
 		p0 = dfclass(A,#2)
 		p0 = dfclass(B,#2)
 		// Pre-set bit 62 of ATMP; it becomes the explicit leading one.
 		ATMP = combine(##0x40000000,#0)
 	}
 	{
 		// Place both 52-bit mantissas at bits 61..10 of ATMP / BTMP.
 		ATMP = insert(A,#MANTBITS,#EXPBITS-1)
 		BTMP = asl(B,#EXPBITS-1)
 		TMP = #-BIAS
 		ONE = #1
 	}
 	{
 		PP_ODD = mpyu(BTMPL,ATMPH)
 		// Force bits 63:62 of BTMP to 01 (clear spilled exponent bits and
 		// add the leading one).
 		BTMP = insert(ONE,#2,#62)
 	}
 	// since we know that the MSB of the H registers is zero, we should never carry
 	// H <= 2^31-1.  L <= 2^32-1.  Therefore, HL <= 2^63-2^32-2^31+1
 	// Adding 2 HLs, we get 2^64-3*2^32+2 maximum.
 	// Therefore, we can add 3 2^32-1 values safely without carry.  We only need one.
 	{
 		PP_LL = mpyu(ATMPL,BTMPL)
 		PP_ODD += mpyu(ATMPL,BTMPH)
 	}
 	{
 		PP_ODD += lsr(PP_LL,#32)
 		PP_HH = mpyu(ATMPH,BTMPH)
 		// Exponent bounds for the range check below: BTMPH = upper bound,
 		// BTMPL = 0 lower bound.
 		BTMP = combine(##BIAS+BIAS-4,#0)
 	}
 	{
 		// PP_HH now holds the upper 64 bits of the 128-bit product.
 		PP_HH += lsr(PP_ODD,#32)
 		if (!p0) jump .Lmul_abnormal
 		// p1 (ANDed) = the discarded low 64 bits of the product are zero.
 		p1 = cmp.eq(PP_LL_L,#0)		// 64 lsb's 0?
 		p1 = cmp.eq(PP_ODD_L,#0)	// 64 lsb's 0?
 	}

 	// PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts
 	// PP_HH can have a minimum of 0x1000_0000_0000_0000 or so

 	// PP_ODD is dead from here on; r7:6 is reused for the exponents.
 #undef PP_ODD
 #undef PP_ODD_H
 #undef PP_ODD_L
 #define EXP10 r7:6
 #define EXP1 r7
 #define EXP0 r6
 	{
 		// Fold the discarded low bits into a sticky LSB.
 		if (!p1) PP_HH_L = or(PP_HH_L,S_ONE)
 		EXP0 = extractu(AH,#EXPBITS,#HI_MANTBITS)
 		EXP1 = extractu(BH,#EXPBITS,#HI_MANTBITS)
 	}
 	{
 		PP_LL = neg(PP_HH)
 		// EXP0 = exp(A) + exp(B) - BIAS
 		EXP0 += add(TMP,EXP1)
 		// Bit 31 of TMP = sign of the product.
 		TMP = xor(AH,BH)
 	}
 	{
 		// Use the negated product when the result sign is negative.
 		if (!p2.new) PP_HH = PP_LL
 		p2 = cmp.gt(TMP,#-1)
 		// p0 (ANDed) = BTMPL < EXP0 <= BTMPH; otherwise take the slow path.
 		p0 = !cmp.gt(EXP0,BTMPH)
 		p0 = cmp.gt(EXP0,BTMPL)
 		if (!p0.new) jump:nt .Lmul_ovf_unf
 	}
 	{
 		// The signed integer-to-double conversion does the rounding.
 		A = convert_d2df(PP_HH)
 		EXP0 = add(EXP0,#-BIAS-58)
 	}
 	{
 		// Add the exponent adjustment into the exponent field of the result.
 		AH += asl(EXP0,#HI_MANTBITS)
 		jumpr r31
 	}

 	.falign
 .Lpossible_unf:
 	// Reached from .Lmul_ovf_unf when the final exponent (EXP1) is > 0.
 	// We end up with a positive exponent
 	// But we may have rounded up to an exponent of 1.
 	// If the exponent is 1, if we rounded up to it
 	// we need to also raise underflow
 	// Fortunately, this is pretty easy to detect, we must have +/- 0x0010_0000_0000_0000
 	// And the PP should also have more than one bit set
 	//
 	// Note: ATMP should have abs(PP_HH)
 	// Note: BTMPL should have 0x7FEFFFFF
 	{
 		// p0 (ANDed) = AL is zero and AH has no bits set under the
 		// 0x7FEFFFFF mask; if not, the result stands and we return.
 		p0 = cmp.eq(AL,#0)
 		p0 = bitsclr(AH,BTMPL)
 		if (!p0.new) jumpr:t r31
 		BTMPH = #0x7fff
 	}
 	{
 		// p0 = all of the low 15 bits of ATMPH are set.
 		p0 = bitsset(ATMPH,BTMPH)
 		BTMPL = USR
 		// 0x30 is the Inexact + Underflow flag pair (see .Lmul_tiny).
 		BTMPH = #0x030
 	}
 	{
 		if (p0) BTMPL = or(BTMPL,BTMPH)
 	}
 	{
 		USR = BTMPL
 	}
 	{
 		// Compare whose predicate result is unused; the same idiom is
 		// annotated "realize exception" elsewhere in this routine.
 		p0 = dfcmp.eq(A,A)
 		jumpr r31
 	}
 	.falign
 .Lmul_ovf_unf:
 	// The pre-rounding exponent EXP0 was outside the fast-path range.
 	// Round once to learn the final exponent, then dispatch to overflow,
 	// possible-underflow, or fall through to the denormal path.
 	{
 		A = convert_d2df(PP_HH)
 		ATMP = abs(PP_HH)			// take absolute value
 		EXP1 = add(EXP0,#-BIAS-58)
 	}
 	{
 		AH += asl(EXP1,#HI_MANTBITS)
 		// Reads AH as it was before this packet's update, i.e. the
 		// exponent field produced by convert_d2df.
 		EXP1 = extractu(AH,#EXPBITS,#HI_MANTBITS)
 		BTMPL = ##0x7FEFFFFF
 	}
 	{
 		// EXP1 = exponent of the rounded result
 		EXP1 += add(EXP0,##-BIAS-58)
 		//BTMPH = add(clb(ATMP),#-2)
 		BTMPH = #0
 	}
 	{
 		p0 = cmp.gt(EXP1,##BIAS+BIAS-2)	// overflow
 		if (p0.new) jump:nt .Lmul_ovf
 	}
 	{
 		p0 = cmp.gt(EXP1,#0)
 		if (p0.new) jump:nt .Lpossible_unf
 		// BTMPH was zeroed above, so this leaves BTMPH = EXP0.
 		BTMPH = sub(EXP0,BTMPH)
 		TMP = #63				// max amount to shift
 	}
 	// Underflow
 	//
 	// PP_HH has the partial product with sticky LSB.
 	// PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts
 	// PP_HH can have a minimum of 0x1000_0000_0000_0000 or so
 	// The exponent of PP_HH is in  EXP1, which is non-positive (0 or negative)
 	// That's the exponent that happens after the normalization
 	//
 	// EXP0 has the exponent that, when added to the normalized value, is out of range.
 	//
 	// Strategy:
 	//
 	// * Shift down bits, with sticky bit, such that the bits are aligned according
 	//   to the LZ count and appropriate exponent, but not all the way to mantissa
 	//   field, keep around the last few bits.
 	// * Put a 1 near the MSB
 	// * Check the LSBs for inexact; if inexact also set underflow
 	// * Convert [u]d2df -- will correctly round according to rounding mode
 	// * Replace exponent field with zero

 	{
 		BTMPL = #0	 			// offset for extract
 		BTMPH = sub(#FUDGE,BTMPH)		// amount to right shift
 	}
 	{
 		p3 = cmp.gt(PP_HH_H,#-1)		// is it positive?
 		BTMPH = min(BTMPH,TMP)			// Don't shift more than 63
 		// Work on the magnitude; the sign is re-applied via p3 below.
 		PP_HH = ATMP
 	}
 	{
 		TMP = USR
 		// Width comes from BTMPH and offset from BTMPL (0): PP_LL gets
 		// the bits that the shift below discards.
 		PP_LL = extractu(PP_HH,BTMP)
 	}
 	{
 		PP_HH = asr(PP_HH,BTMPH)
 		BTMPL = #0x0030					// Inexact + Underflow flags
 		AH = insert(S_ZERO,#EXPBITS,#HI_MANTBITS)
 	}
 	{
 		p0 = cmp.gtu(ONE,PP_LL)				// Did we extract all zeros?
 		if (!p0.new) PP_HH_L = or(PP_HH_L,S_ONE)	// add sticky bit
 		PP_HH_H = setbit(PP_HH_H,#HI_MANTBITS+3)	// Add back in a bit so we can use convert instruction
 	}
 	{
 		PP_LL = neg(PP_HH)
 		p1 = bitsclr(PP_HH_L,#0x7)		// Are the LSB's clear?
 		if (!p1.new) TMP = or(BTMPL,TMP)	// If not, Inexact+Underflow
 	}
 	{
 		// Restore the sign of the product.
 		if (!p3) PP_HH = PP_LL
 		USR = TMP
 	}
 	{
 		A = convert_d2df(PP_HH)			// Do rounding
 		p0 = dfcmp.eq(A,A)			// realize exception
 	}
 	{
 		AH = insert(S_ZERO,#EXPBITS-1,#HI_MANTBITS+1)		// Insert correct exponent
 		jumpr r31
 	}
 	.falign
 .Lmul_ovf:
 	// Overflow: pick infinity or the largest finite value from the USR
 	// rounding mode and the result sign, and set the sticky flags.
 	// We get either max finite value or infinity.  Either way, overflow+inexact
 	{
 		TMP = USR
 		ATMP = combine(##0x7fefffff,#-1)	// positive max finite
 		// Only the sign bit of A (taken from PP_HH) is used below.
 		A = PP_HH
 	}
 	{
 		PP_LL_L = extractu(TMP,#2,#SR_ROUND_OFF)	// rounding bits
 		TMP = or(TMP,#0x28)			// inexact + overflow
 		BTMP = combine(##0x7ff00000,#0)		// positive infinity
 	}
 	{
 		USR = TMP
 		// PP_LL_L = rounding mode with its LSB flipped for negative results
 		PP_LL_L ^= lsr(AH,#31)			// Does sign match rounding?
 		TMP = PP_LL_L				// unmodified rounding mode
 	}
 	{
 		// The two compare writes to p0 are ANDed.
 		p0 = !cmp.eq(TMP,#1)			// If not round-to-zero and
 		p0 = !cmp.eq(PP_LL_L,#2)		// Not rounding the other way,
 		if (p0.new) ATMP = BTMP			// we should get infinity
 		p0 = dfcmp.eq(A,A)			// Realize FP exception if enabled
 	}
 	{
 		A = insert(ATMP,#63,#0)			// insert inf/maxfinite, leave sign
 		jumpr r31
 	}

 .Lmul_abnormal:
 	// At least one input failed the dfclass #2 test at entry. Order the
 	// operands so A has the larger magnitude, then handle NaN, infinity
 	// and zero; a remaining subnormal operand is normalized and the
 	// multiply is retried.
 	{
 		ATMP = extractu(A,#63,#0)		// strip off sign
 		BTMP = extractu(B,#63,#0)		// strip off sign
 	}
 	{
 		// Swap A and B unless |A| > |B| (both moves read the old values).
 		p3 = cmp.gtu(ATMP,BTMP)
 		if (!p3.new) A = B			// sort values
 		if (!p3.new) B = A			// sort values
 	}
 	{
 		// Any NaN --> NaN, possibly raise invalid if sNaN
 		p0 = dfclass(A,#0x0f)		// A not NaN?
 		if (!p0.new) jump:nt .Linvalid_nan
 		// Keep the magnitudes in the same order as A and B.
 		if (!p3) ATMP = BTMP
 		if (!p3) BTMP = ATMP
 	}
 	{
 		// Infinity * nonzero number is infinity
 		p1 = dfclass(A,#0x08)		// A is infinity
 		p1 = dfclass(B,#0x0e)		// B is nonzero
 	}
 	{
 		// Infinity * zero --> NaN, raise invalid
 		// Other zeros return zero
 		p0 = dfclass(A,#0x08)		// A is infinity
 		p0 = dfclass(B,#0x01)		// B is zero
 	}
 	{
 		if (p1) jump .Ltrue_inf
 		p2 = dfclass(B,#0x01)
 	}
 	{
 		if (p0) jump .Linvalid_zeroinf
 		if (p2) jump .Ltrue_zero		// so return zero
 		// Mask covering the top five bits of the exponent field in AH.
 		TMP = ##0x7c000000
 	}
 	// We are left with a normal or subnormal times a subnormal. A > B
 	// If A and B are both very small (exp(a) < BIAS-MANTBITS),
 	// we go to a single sticky bit, which we can round easily.
 	// If A and B might multiply to something bigger, decrease A exponent and increase
 	// B exponent and try again
 	{
 		// True when the top five exponent bits of A are all zero.
 		p0 = bitsclr(AH,TMP)
 		if (p0.new) jump:nt .Lmul_tiny
 	}
 	{
 		// Leading-zero count of |B|.
 		TMP = cl0(BTMP)
 	}
 	{
 		// Shift needed to bring |B|'s leading one to bit 52.
 		TMP = add(TMP,#-EXPBITS)
 	}
 	{
 		BTMP = asl(BTMP,TMP)
 	}
 	{
 		// Replace B's magnitude with the normalized one (sign preserved)
 		// and lower A's exponent by the same shift to compensate.
 		B = insert(BTMP,#63,#0)
 		AH -= asl(TMP,#HI_MANTBITS)
 	}
 	// Retry with the rescaled operands.
 	jump __hexagon_muldf3
 .Lmul_tiny:
 	// The product is too small to represent: the result is either a
 	// signed zero or a signed value with only the lowest bit set,
 	// depending on the rounding mode. Inexact + Underflow are always set.
 	{
 		TMP = USR
 		A = xor(A,B)				// get sign bit
 	}
 	{
 		TMP = or(TMP,#0x30)			// Inexact + Underflow
 		// Low 63 bits of A become 1; the sign bit is kept.
 		A = insert(ONE,#63,#0)			// put in rounded up value
 		BTMPH = extractu(TMP,#2,#SR_ROUND_OFF)	// get rounding mode
 	}
 	{
 		USR = TMP
 		// The compare reads BTMPH before the xor in this packet updates it.
 		p0 = cmp.gt(BTMPH,#1)			// Round towards pos/neg inf?
 		if (!p0.new) AL = #0			// If not, zero
 		BTMPH ^= lsr(AH,#31)			// rounding my way --> set LSB
 	}
 	{
 		// NOTE(review): mode^sign == 3 is presumably the directed mode that
 		// rounds this sign away from zero -- confirm against the USR
 		// rounding-mode encoding.
 		p0 = cmp.eq(BTMPH,#3)			// rounding away from zero for this sign?
 		if (!p0.new) AL = #0			// if not, result is a signed zero
 		jumpr r31
 	}
 .Linvalid_zeroinf:
 	// Infinity * zero: return an all-ones NaN pattern and OR bit 0x2 (the
 	// invalid flag) into USR.
 	{
 		TMP = USR
 	}
 	{
 		A = #-1
 		TMP = or(TMP,#2)
 	}
 	{
 		USR = TMP
 	}
 	{
 		p0 = dfcmp.uo(A,A)			// force exception if enabled
 		jumpr r31
 	}
 .Linvalid_nan:
 	// A is a NaN (A is the larger-magnitude operand after sorting).
 	// The two conversions are executed only for their side effect of
 	// raising invalid on a signaling NaN; the returned value is always
 	// the all-ones pattern.
 	{
 		p0 = dfclass(B,#0x0f)			// if B is not NaN
 		TMP = convert_df2sf(A)			// will generate invalid if sNaN
 		if (p0.new) B = A 			// make it whatever A is
 	}
 	{
 		BL = convert_df2sf(B)			// will generate invalid if sNaN
 		A = #-1
 		jumpr r31
 	}
 	.falign
 .Ltrue_zero:
 	// B (the smaller-magnitude operand) is zero: swap so the zero is in A,
 	// then fall through to apply the sign of the other operand.
 	{
 		A = B
 		B = A
 	}
 .Ltrue_inf:
 	// Return A (infinity or zero) with its sign XORed with B's sign.
 	{
 		// Isolate B's sign bit.
 		BH = extract(BH,#1,#31)
 	}
 	{
 		AH ^= asl(BH,#31)
 		jumpr r31
 	}
 END(__hexagon_muldf3)

 // Release the scratch-pair names defined at the top of the file.
 #undef ATMP
 #undef ATMPL
 #undef ATMPH
 #undef BTMP
 #undef BTMPL
 #undef BTMPH
	//===----------------------Hexagon builtin routine ------------------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//

	// Double Precision Multiply
	// Operand registers. A and B hold the two double-precision inputs and the
	// product is left in A. The H/L names are the high and low 32-bit words
	// of each 64-bit register pair.
	#define A r1:0
	#define AH r1
	#define AL r0
	#define B r3:2
	#define BH r3
	#define BL r2

	// Scratch pair: holds B's aligned mantissa while the partial products are
	// formed, then is reused for exponent bounds, shift counts and USR masks.
	#define BTMP r5:4
	#define BTMPH r5
	#define BTMPL r4

	// Accumulator for the two cross (high*low) partial products. These names
	// are #undef'd after the multiply and r7:6 is renamed EXP1/EXP0.
	#define PP_ODD r7:6
	#define PP_ODD_H r7
	#define PP_ODD_L r6

	// 64-bit constant 1 (loaded with "ONE = #1"): S_ONE is its low word (1)
	// and S_ZERO its high word (0).
	#define ONE r9:8
	#define S_ONE r8
	#define S_ZERO r9

	// High*high partial product; after accumulation it carries the upper
	// 64 bits of the mantissa product and is the input to convert_d2df.
	#define PP_HH r11:10
	#define PP_HH_H r11
	#define PP_HH_L r10

	// Scratch pair: A's aligned mantissa during the multiply; later
	// abs(PP_HH) on the overflow/underflow path and |A| on the abnormal path.
	#define ATMP r13:12
	#define ATMPH r13
	#define ATMPL r12

	// Low*low partial product; later reused for the negated product, the
	// shifted-out bits and the rounding-mode bits.
	#define PP_LL r15:14
	#define PP_LL_H r15
	#define PP_LL_L r14

	// General-purpose 32-bit scratch register.
	#define TMP r28

	// Double-precision field widths: 52 mantissa bits (20 of them in the high
	// word) and 11 exponent bits.
	#define MANTBITS 52
	#define HI_MANTBITS 20
	#define EXPBITS 11
	// NOTE(review): the exponent arithmetic below is written around 1024
	// (see the "-BIAS-58" adjustments); change only together with those.
	#define BIAS 1024
	// Not referenced by any instruction in this file as shown.
	#define MANTISSA_TO_INT_BIAS 52

	// Some constant to adjust normalization amount in error code
	// Amount to right shift the partial product to get to a denorm
	#define FUDGE 5

	// Alias helpers: each emits a global symbol (__qdsp_<TAG>,
	// __hexagon_fast_<TAG>, __hexagon_fast2_<TAG>) set equal to
	// __hexagon_<TAG>. END records the symbol size via .size.
	#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
	#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG
	#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG
	#define END(TAG) .size TAG,.-TAG

	// Bit offset of the 2-bit rounding-mode field read out of USR.
	#define SR_ROUND_OFF 22
	// NOTE(review): __hexagon_muldf3 and its local labels are already
	// defined earlier in this file; this second copy looks like an
	// accidental duplicate and would redefine those symbols -- confirm and
	// drop one copy.
	//
	// __hexagon_muldf3: double-precision multiply, A = A * B.
	// In:  A (r1:0) and B (r3:2).  Out: product in A.  Returns via r31.
	// The inline path handles the case where both inputs pass the
	// dfclass #2 test and the result exponent is in range; everything
	// else branches to .Lmul_abnormal or .Lmul_ovf_unf.
	.text
	.global __hexagon_muldf3
	.type __hexagon_muldf3,@function
	Q6_ALIAS(muldf3)
	FAST_ALIAS(muldf3)
	FAST2_ALIAS(muldf3)
	.p2align 5
	__hexagon_muldf3:
	{
	// Two writes to p0 in one packet are ANDed: p0 is true only when
	// both A and B match dfclass mask #2.
	p0 = dfclass(A,#2)
	p0 = dfclass(B,#2)
	// Pre-set bit 62 of ATMP; it becomes the explicit leading one.
	ATMP = combine(##0x40000000,#0)
	}
	{
	// Place both 52-bit mantissas at bits 61..10 of ATMP / BTMP.
	ATMP = insert(A,#MANTBITS,#EXPBITS-1)
	BTMP = asl(B,#EXPBITS-1)
	TMP = #-BIAS
	ONE = #1
	}
	{
	PP_ODD = mpyu(BTMPL,ATMPH)
	// Force bits 63:62 of BTMP to 01 (clear spilled exponent bits and
	// add the leading one).
	BTMP = insert(ONE,#2,#62)
	}
	// since we know that the MSB of the H registers is zero, we should never carry
	// H <= 2^31-1. L <= 2^32-1. Therefore, HL <= 2^63-2^32-2^31+1
	// Adding 2 HLs, we get 2^64-3*2^32+2 maximum.
	// Therefore, we can add 3 2^32-1 values safely without carry. We only need one.
	{
	PP_LL = mpyu(ATMPL,BTMPL)
	PP_ODD += mpyu(ATMPL,BTMPH)
	}
	{
	PP_ODD += lsr(PP_LL,#32)
	PP_HH = mpyu(ATMPH,BTMPH)
	// Exponent bounds for the range check below: BTMPH = upper bound,
	// BTMPL = 0 lower bound.
	BTMP = combine(##BIAS+BIAS-4,#0)
	}
	{
	// PP_HH now holds the upper 64 bits of the 128-bit product.
	PP_HH += lsr(PP_ODD,#32)
	if (!p0) jump .Lmul_abnormal
	// p1 (ANDed) = the discarded low 64 bits of the product are zero.
	p1 = cmp.eq(PP_LL_L,#0) // 64 lsb's 0?
	p1 = cmp.eq(PP_ODD_L,#0) // 64 lsb's 0?
	}

	// PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts
	// PP_HH can have a minimum of 0x1000_0000_0000_0000 or so

	// PP_ODD is dead from here on; r7:6 is reused for the exponents.
	#undef PP_ODD
	#undef PP_ODD_H
	#undef PP_ODD_L
	#define EXP10 r7:6
	#define EXP1 r7
	#define EXP0 r6
	{
	// Fold the discarded low bits into a sticky LSB.
	if (!p1) PP_HH_L = or(PP_HH_L,S_ONE)
	EXP0 = extractu(AH,#EXPBITS,#HI_MANTBITS)
	EXP1 = extractu(BH,#EXPBITS,#HI_MANTBITS)
	}
	{
	PP_LL = neg(PP_HH)
	// EXP0 = exp(A) + exp(B) - BIAS
	EXP0 += add(TMP,EXP1)
	// Bit 31 of TMP = sign of the product.
	TMP = xor(AH,BH)
	}
	{
	// Use the negated product when the result sign is negative.
	if (!p2.new) PP_HH = PP_LL
	p2 = cmp.gt(TMP,#-1)
	// p0 (ANDed) = BTMPL < EXP0 <= BTMPH; otherwise take the slow path.
	p0 = !cmp.gt(EXP0,BTMPH)
	p0 = cmp.gt(EXP0,BTMPL)
	if (!p0.new) jump:nt .Lmul_ovf_unf
	}
	{
	// The signed integer-to-double conversion does the rounding.
	A = convert_d2df(PP_HH)
	EXP0 = add(EXP0,#-BIAS-58)
	}
	{
	// Add the exponent adjustment into the exponent field of the result.
	AH += asl(EXP0,#HI_MANTBITS)
	jumpr r31
	}

	.falign
	.Lpossible_unf:
	// Reached from .Lmul_ovf_unf when the final exponent (EXP1) is > 0.
	// We end up with a positive exponent
	// But we may have rounded up to an exponent of 1.
	// If the exponent is 1, if we rounded up to it
	// we need to also raise underflow
	// Fortunately, this is pretty easy to detect, we must have +/- 0x0010_0000_0000_0000
	// And the PP should also have more than one bit set
	//
	// Note: ATMP should have abs(PP_HH)
	// Note: BTMPL should have 0x7FEFFFFF
	{
	// p0 (ANDed) = AL is zero and AH has no bits set under the
	// 0x7FEFFFFF mask; if not, the result stands and we return.
	p0 = cmp.eq(AL,#0)
	p0 = bitsclr(AH,BTMPL)
	if (!p0.new) jumpr:t r31
	BTMPH = #0x7fff
	}
	{
	// p0 = all of the low 15 bits of ATMPH are set.
	p0 = bitsset(ATMPH,BTMPH)
	BTMPL = USR
	// 0x30 is the Inexact + Underflow flag pair (see .Lmul_tiny).
	BTMPH = #0x030
	}
	{
	if (p0) BTMPL = or(BTMPL,BTMPH)
	}
	{
	USR = BTMPL
	}
	{
	// Compare whose predicate result is unused; the same idiom is
	// annotated "realize exception" elsewhere in this routine.
	p0 = dfcmp.eq(A,A)
	jumpr r31
	}
	.falign
	.Lmul_ovf_unf:
	// The pre-rounding exponent EXP0 was outside the fast-path range.
	// Round once to learn the final exponent, then dispatch to overflow,
	// possible-underflow, or fall through to the denormal path.
	{
	A = convert_d2df(PP_HH)
	ATMP = abs(PP_HH) // take absolute value
	EXP1 = add(EXP0,#-BIAS-58)
	}
	{
	AH += asl(EXP1,#HI_MANTBITS)
	// Reads AH as it was before this packet's update, i.e. the
	// exponent field produced by convert_d2df.
	EXP1 = extractu(AH,#EXPBITS,#HI_MANTBITS)
	BTMPL = ##0x7FEFFFFF
	}
	{
	// EXP1 = exponent of the rounded result
	EXP1 += add(EXP0,##-BIAS-58)
	//BTMPH = add(clb(ATMP),#-2)
	BTMPH = #0
	}
	{
	p0 = cmp.gt(EXP1,##BIAS+BIAS-2) // overflow
	if (p0.new) jump:nt .Lmul_ovf
	}
	{
	p0 = cmp.gt(EXP1,#0)
	if (p0.new) jump:nt .Lpossible_unf
	// BTMPH was zeroed above, so this leaves BTMPH = EXP0.
	BTMPH = sub(EXP0,BTMPH)
	TMP = #63 // max amount to shift
	}
	// Underflow
	//
	// PP_HH has the partial product with sticky LSB.
	// PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts
	// PP_HH can have a minimum of 0x1000_0000_0000_0000 or so
	// The exponent of PP_HH is in EXP1, which is non-positive (0 or negative)
	// That's the exponent that happens after the normalization
	//
	// EXP0 has the exponent that, when added to the normalized value, is out of range.
	//
	// Strategy:
	//
	// * Shift down bits, with sticky bit, such that the bits are aligned according
	// to the LZ count and appropriate exponent, but not all the way to mantissa
	// field, keep around the last few bits.
	// * Put a 1 near the MSB
	// * Check the LSBs for inexact; if inexact also set underflow
	// * Convert [u]d2df -- will correctly round according to rounding mode
	// * Replace exponent field with zero

	{
	BTMPL = #0 // offset for extract
	BTMPH = sub(#FUDGE,BTMPH) // amount to right shift
	}
	{
	p3 = cmp.gt(PP_HH_H,#-1) // is it positive?
	BTMPH = min(BTMPH,TMP) // Don't shift more than 63
	// Work on the magnitude; the sign is re-applied via p3 below.
	PP_HH = ATMP
	}
	{
	TMP = USR
	// Width comes from BTMPH and offset from BTMPL (0): PP_LL gets
	// the bits that the shift below discards.
	PP_LL = extractu(PP_HH,BTMP)
	}
	{
	PP_HH = asr(PP_HH,BTMPH)
	BTMPL = #0x0030 // Inexact + Underflow flags
	AH = insert(S_ZERO,#EXPBITS,#HI_MANTBITS)
	}
	{
	p0 = cmp.gtu(ONE,PP_LL) // Did we extract all zeros?
	if (!p0.new) PP_HH_L = or(PP_HH_L,S_ONE) // add sticky bit
	PP_HH_H = setbit(PP_HH_H,#HI_MANTBITS+3) // Add back in a bit so we can use convert instruction
	}
	{
	PP_LL = neg(PP_HH)
	p1 = bitsclr(PP_HH_L,#0x7) // Are the LSB's clear?
	if (!p1.new) TMP = or(BTMPL,TMP) // If not, Inexact+Underflow
	}
	{
	// Restore the sign of the product.
	if (!p3) PP_HH = PP_LL
	USR = TMP
	}
	{
	A = convert_d2df(PP_HH) // Do rounding
	p0 = dfcmp.eq(A,A) // realize exception
	}
	{
	AH = insert(S_ZERO,#EXPBITS-1,#HI_MANTBITS+1) // Insert correct exponent
	jumpr r31
	}
	.falign
	.Lmul_ovf:
	// Overflow: pick infinity or the largest finite value from the USR
	// rounding mode and the result sign, and set the sticky flags.
	// We get either max finite value or infinity. Either way, overflow+inexact
	{
	TMP = USR
	ATMP = combine(##0x7fefffff,#-1) // positive max finite
	// Only the sign bit of A (taken from PP_HH) is used below.
	A = PP_HH
	}
	{
	PP_LL_L = extractu(TMP,#2,#SR_ROUND_OFF) // rounding bits
	TMP = or(TMP,#0x28) // inexact + overflow
	BTMP = combine(##0x7ff00000,#0) // positive infinity
	}
	{
	USR = TMP
	// PP_LL_L = rounding mode with its LSB flipped for negative results
	PP_LL_L ^= lsr(AH,#31) // Does sign match rounding?
	TMP = PP_LL_L // unmodified rounding mode
	}
	{
	// The two compare writes to p0 are ANDed.
	p0 = !cmp.eq(TMP,#1) // If not round-to-zero and
	p0 = !cmp.eq(PP_LL_L,#2) // Not rounding the other way,
	if (p0.new) ATMP = BTMP // we should get infinity
	p0 = dfcmp.eq(A,A) // Realize FP exception if enabled
	}
	{
	A = insert(ATMP,#63,#0) // insert inf/maxfinite, leave sign
	jumpr r31
	}

	.Lmul_abnormal:
	// At least one input failed the dfclass #2 test at entry. Order the
	// operands so A has the larger magnitude, then handle NaN, infinity
	// and zero; a remaining subnormal operand is normalized and the
	// multiply is retried.
	{
	ATMP = extractu(A,#63,#0) // strip off sign
	BTMP = extractu(B,#63,#0) // strip off sign
	}
	{
	// Swap A and B unless |A| > |B| (both moves read the old values).
	p3 = cmp.gtu(ATMP,BTMP)
	if (!p3.new) A = B // sort values
	if (!p3.new) B = A // sort values
	}
	{
	// Any NaN --> NaN, possibly raise invalid if sNaN
	p0 = dfclass(A,#0x0f) // A not NaN?
	if (!p0.new) jump:nt .Linvalid_nan
	// Keep the magnitudes in the same order as A and B.
	if (!p3) ATMP = BTMP
	if (!p3) BTMP = ATMP
	}
	{
	// Infinity * nonzero number is infinity
	p1 = dfclass(A,#0x08) // A is infinity
	p1 = dfclass(B,#0x0e) // B is nonzero
	}
	{
	// Infinity * zero --> NaN, raise invalid
	// Other zeros return zero
	p0 = dfclass(A,#0x08) // A is infinity
	p0 = dfclass(B,#0x01) // B is zero
	}
	{
	if (p1) jump .Ltrue_inf
	p2 = dfclass(B,#0x01)
	}
	{
	if (p0) jump .Linvalid_zeroinf
	if (p2) jump .Ltrue_zero // so return zero
	// Mask covering the top five bits of the exponent field in AH.
	TMP = ##0x7c000000
	}
	// We are left with a normal or subnormal times a subnormal. A > B
	// If A and B are both very small (exp(a) < BIAS-MANTBITS),
	// we go to a single sticky bit, which we can round easily.
	// If A and B might multiply to something bigger, decrease A exponent and increase
	// B exponent and try again
	{
	// True when the top five exponent bits of A are all zero.
	p0 = bitsclr(AH,TMP)
	if (p0.new) jump:nt .Lmul_tiny
	}
	{
	// Leading-zero count of |B|.
	TMP = cl0(BTMP)
	}
	{
	// Shift needed to bring |B|'s leading one to bit 52.
	TMP = add(TMP,#-EXPBITS)
	}
	{
	BTMP = asl(BTMP,TMP)
	}
	{
	// Replace B's magnitude with the normalized one (sign preserved)
	// and lower A's exponent by the same shift to compensate.
	B = insert(BTMP,#63,#0)
	AH -= asl(TMP,#HI_MANTBITS)
	}
	// Retry with the rescaled operands.
	jump __hexagon_muldf3
	.Lmul_tiny:
	// The product is too small to represent: the result is either a
	// signed zero or a signed value with only the lowest bit set,
	// depending on the rounding mode. Inexact + Underflow are always set.
	{
	TMP = USR
	A = xor(A,B) // get sign bit
	}
	{
	TMP = or(TMP,#0x30) // Inexact + Underflow
	// Low 63 bits of A become 1; the sign bit is kept.
	A = insert(ONE,#63,#0) // put in rounded up value
	BTMPH = extractu(TMP,#2,#SR_ROUND_OFF) // get rounding mode
	}
	{
	USR = TMP
	// The compare reads BTMPH before the xor in this packet updates it.
	p0 = cmp.gt(BTMPH,#1) // Round towards pos/neg inf?
	if (!p0.new) AL = #0 // If not, zero
	BTMPH ^= lsr(AH,#31) // rounding my way --> set LSB
	}
	{
	// NOTE(review): mode^sign == 3 is presumably the directed mode that
	// rounds this sign away from zero -- confirm against the USR
	// rounding-mode encoding.
	p0 = cmp.eq(BTMPH,#3) // rounding away from zero for this sign?
	if (!p0.new) AL = #0 // if not, result is a signed zero
	jumpr r31
	}
	.Linvalid_zeroinf:
	// Infinity * zero: return an all-ones NaN pattern and OR bit 0x2 (the
	// invalid flag) into USR.
	{
	TMP = USR
	}
	{
	A = #-1
	TMP = or(TMP,#2)
	}
	{
	USR = TMP
	}
	{
	p0 = dfcmp.uo(A,A) // force exception if enabled
	jumpr r31
	}
	.Linvalid_nan:
	// A is a NaN (A is the larger-magnitude operand after sorting).
	// The two conversions are executed only for their side effect of
	// raising invalid on a signaling NaN; the returned value is always
	// the all-ones pattern.
	{
	p0 = dfclass(B,#0x0f) // if B is not NaN
	TMP = convert_df2sf(A) // will generate invalid if sNaN
	if (p0.new) B = A // make it whatever A is
	}
	{
	BL = convert_df2sf(B) // will generate invalid if sNaN
	A = #-1
	jumpr r31
	}
	.falign
	.Ltrue_zero:
	// B (the smaller-magnitude operand) is zero: swap so the zero is in A,
	// then fall through to apply the sign of the other operand.
	{
	A = B
	B = A
	}
	.Ltrue_inf:
	// Return A (infinity or zero) with its sign XORed with B's sign.
	{
	// Isolate B's sign bit.
	BH = extract(BH,#1,#31)
	}
	{
	AH ^= asl(BH,#31)
	jumpr r31
	}
	END(__hexagon_muldf3)

	// Release the scratch-pair names defined at the top of the file.
	#undef ATMP
	#undef ATMPL
	#undef ATMPH
	#undef BTMP
	#undef BTMPL
	#undef BTMPH