lib/builtins/arm/muldf3.S - llvm-project/compiler-rt - Git at Google

 //===-- muldf3.S - double-precision floating point multiplication ---------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // This file implements the __muldf3 function (double precision floating point
 // multiplication), with the IEEE-754 default rounding (to nearest, ties to
 // even), for the Arm and Thumb2 ISAs.
 //
 //===----------------------------------------------------------------------===//

 #include "../assembly.h"
 #include "crt_endian.h"

   .syntax unified
   .text
   .p2align 2

 #if __ARM_PCS_VFP
 DEFINE_COMPILERRT_FUNCTION(__muldf3)
   push {r4, lr}
   VMOV_FROM_DOUBLE(r0, r1, d0)
   VMOV_FROM_DOUBLE(r2, r3, d1)
   bl __aeabi_dmul
   VMOV_TO_DOUBLE(d0, r0, r1)
   pop {r4, pc}
 #else
 DEFINE_COMPILERRT_FUNCTION_ALIAS(__muldf3, __aeabi_dmul)
 #endif

 DEFINE_COMPILERRT_FUNCTION(__aeabi_dmul)

   push    {r4,r5,r6,lr}

   // Check if either input exponent is 000 or 7FF (i.e. not a normalized
   // number), and if so, branch out of line. If we don't branch out of line,
   // then we've also extracted the exponents of the input values x and y into
   // bits 16..26 of r14 and r5 respectively. But if we do, then that hasn't
   // necessarily been done (because the second AND might have been skipped).
   ldr     r12, =0x07FF0000
   ands    r14, r12, xh, lsr #4 // sets Z if exponent of x is 0
   andsne  r5, r12, yh, lsr #4  // otherwise, sets Z if exponent of y is 0
   teqne   r14, r12             // otherwise, sets Z if exponent of x is 7FF
   teqne   r5, r12              // otherwise, sets Z if exponent of y is 7FF
   beq     LOCAL_LABEL(uncommon)        // branch out of line to handle inf/NaN/0/denorm

   // Calculate the sign of the result, and put it in an unused bit of r14.
   eor     r4, xh, yh           // XOR the input signs to get the result sign
   orr     r14, r14, r4, lsr #31 // save it in the low bit of r14

   // Clear the exponent and sign bits from the top word of each mantissa, and
   // set the leading mantissa bit in each one, so that they're in the right
   // form to be multiplied.
   bic     xh, xh, r12, lsl #5     // r12 = 0x07FF0000, so r12 << 5 = 0xFF800000
   bic     yh, yh, r12, lsl #5
   orr     xh, xh, #(1 << 20)
   orr     yh, yh, #(1 << 20)

   // Now we're ready to multiply mantissas. This is also the place we'll come
   // back to after decoding denormal inputs. The denormal decoding will also
   // have to set up the same register contents:
   //  - fractions in xh/xl and yh/yl, with leading bits at bit 20 of xh/yh
   //  - exponents in r14 and r5, starting at bit 16
   //  - output sign in r14 bit 0
 LOCAL_LABEL(mul):

   // Multiply the two mantissas as if they were full 64-bit words, delivering a
   // 128-bit output in four registers. We provide three different ways to do
   // this, using different instructions.
   //
   // Interleaved with the multiplication code, we also compute the output
   // exponent by adding the input exponents and rebiasing. This takes two
   // instructions. We schedule each one after a multiplication, to use a delay
   // slot from the multiplication on CPUs where there is one.
   //
   // We add r5 to r14, so that the output exponent is in the top half of r14,
   // and r5 is freed up to be used in the multiplication.
   //
   // We rebias the exponent by subtracting 0x400, which is correct for one of
   // the two places where the leading bit of the product could end up, and will
   // need correcting by one in the other case.
   //
   // Exit conditions from the three-way #if:
   //
   // r4:r5:r6 are the top 96 bits of the 128-bit product, with the leading bit
   // at either bit 8 or bit 9 of r4. The low bit of r6 is forced to 1 if any of
   // the low 32 bits of the 128-bit product were set.
   //
   // The output sign is still in the low bit of r14; the top half contains the
   // preliminary output exponent (yet to be adjusted depending on where the
   // high bit of the product ended up).

 #if __ARM_FEATURE_DSP
   // The UMAAL instruction, which computes a 64-bit product and adds two
   // separate 32-bit values to it, makes this easy.
   umull   r6, r4, xh, yl
   add     r14, r14, r5             // add exponents, freeing up r5
   umull   r12, r5, xl, yl
   sub     r14, r14, #0x4000000     // initial rebiasing of exponent
   umaal   r6, r5, xl, yh
   umaal   r5, r4, xh, yh
 #elif ARM_FP_DMUL_USE_UMLAL
   // The UMLAL instruction computes a 64-bit product and adds a 64-bit value to
   // it. But it doesn't write to the carry flag, so you can't tell if the
   // addition wrapped. Therefore you have to use it in a way that means the
   // addition never wraps. Here we do three of the four multiplications (xl*yl,
   // xl*yh, xh*yh) in a chain, using UMLAL for the top two, in each case with
   // the 64-bit accumulator consisting of the top half of the previous
   // multiplication, and a high word set to zero before the UMLAL instruction.
   //
   // On Cortex-M3, this is not a win over just using UMULL and doing the
   // additions by hand, because UMLAL takes two cycles longer than UMULL, and
   // it also costs a cycle to initialise each of the two high accumulator words
   // to zero. If the high word of the addend were not zero then those two
   // cycles would be doing something useful, but as it is, they're wasted time.
   //
   // CPUs later than Cortex-M3 - in particular, Cortex-M4 - will do both UMLAL
   // and UMULL much faster, so that this code is a win over the plain UMULL
   // code below. But those CPUs typically have UMAAL anyway and will use the
   // even faster version of the code above. So this code is provided in case
   // it's useful, but won't be enabled unless you manually #define
   // ARM_FP_DMUL_USE_UMLAL.
   umull   r12, r6, xl, yl
   add     r14, r14, r5             // add exponents, freeing up r5
   movs    r5, #0
   umlal   r6, r5, xl, yh
   movs    r4, #0
   umlal   r5, r4, xh, yh
   sub     r14, r14, #0x4000000     // initial rebiasing of exponent
   umull   xl, yh, xh, yl
   adds    r6, r6, xl
   adcs    r5, r5, yh
   adc     r4, r4, #0
 #else
   // Simplest approach, using plain UMULL to compute each 64-bit product, and
   // separate ADD and ADC instructions to do the additions. On Cortex-M3 this
   // wins over the UMLAL approach: it's one instruction longer, but three
   // cycles quicker, since each use of UMLAL in the above version costs 2
   // cycles.
   umull   r4, r12, xh, yl
   add     r14, r14, r5             // add exponents, freeing up r5
   umull   r6, r5, xl, yh
   sub     r14, r14, #0x4000000     // initial rebiasing of exponent
   adds    r6, r6, r4
   adcs    r5, r5, r12              // carry from here is used below

   umull   r4, r12, xh, yh          // r12:r4 is top part
   adc     yh, r12, #0              // get carry from above addition
   umull   r12, xh, xl, yl          // xh:r12 is bottom part

   adds    r6, r6, xh
   adcs    r5, r5, r4
   adcs    r4, yh, #0
 #endif

   // Now the full 128-bit product of the two mantissas occupies the four
   // registers r4,r5,r6,r12 (in order from MSW to LSW). Since each input
   // mantissa was in the range [2^52,2^53), the product is in the range
   // [2^104,2^106), which means that the lowest-order word r12 is a long way
   // below the round bit, so that it can only affect cases so close to a
   // rounding boundary that you need to know if it's nonzero to tell whether
   // you're rounding to even. Start by freeing up that register, ensuring the
   // low bit of r6 is set if anything in r12 was nonzero.
   tst     r12, r12
   orrne   r6, r6, #1

   // Now we can regard the result as a 96-bit value in r4,r5,r6, with its
   // leading bit in either bit 8 or 9 of r4. To move that bit up to its final
   // position in bit 20, we must shift the whole thing left by either 11 or 12
   // bits. Find out which.
   tst     r4, #0x200               // is bit 9 set?
   bne     LOCAL_LABEL(shift11)             // if so, only shift by 11 bits

   // In this branch, we're shifting left by 12 bits. Put the shifted result
   // back into the output registers xh,xl, and the bits lower than the bottom
   // mantissa bit into r4.
   lsls    xh, r4, #12              // shift each input reg left 12
   lsls    xl, r5, #12
   lsls    r4, r6, #12
   orr     xh, xh, r5, lsr #20      // and the top two right by 32-12
   orr     xl, xl, r6, lsr #20

   b       LOCAL_LABEL(shifted)

 LOCAL_LABEL(shift11):
   // In this branch, we're shifting left by 11 bits instead of 12, and we must
   // adjust the exponent by 1 to compensate.
   lsls    xh, r4, #11              // shift each input reg left 11
   lsls    xl, r5, #11
   lsls    r4, r6, #11
   orr     xh, xh, r5, lsr #21      // and the top two right by 32-11
   orr     xl, xl, r6, lsr #21
   add     r14, r14, #0x10000       // adjust the exponent

 LOCAL_LABEL(shifted):
   // We've reconverged after shifting the mantissa, so that now the leading 1
   // bit of the mantissa is in bit 20 of xh, and r4 contains the bits lower
   // than the bottom of xl.

   // Recombine the sign and exponent into the high bits of xh. If the exponent
   // is over- or underflowed, this may not give a valid FP result, but because
   // everything is put on by addition, it will be right "mod 2^64" so that we
   // can bias the exponent back into range for underflow handling and that will
   // recover the right sign.
   //
   // r14 still has the output sign in its low bit. To extract just the exponent
   // for adding to xh, we could use BIC to clear that bit, or shift the value
   // right. We do the latter, which saves a copy of the pre-rounding exponent
   // in yl, to use later for overflow detection. The shift is ASR, so that if
   // the exponent is negative due to underflow, it stays negative.
   asr     yl, r14, #16             // isolate the exponent
   add     xh, xh, yl, lsl #20      // shift it back up to add to xh
   add     xh, xh, r14, lsl #31     // then add the sign

   // If we have to handle an underflow, we'll need enough information to
   // reconstruct the rounding direction. Our strategy is
   //
   //  - save the LSW of the output before rounding: if that differs from the
   //    LSW after rounding then we rounded up
   //  - save the round word r4: if that is zero then we didn't round at all.
   //
   // We're going to branch past the rounding code for a quicker exit in the
   // case where we're exact. In that case we don't need to save the output LSW
   // at all, because the zero round word will override whatever it would have
   // been anyway.
   movs    r6, r4                   // unconditionally save round word
   beq     LOCAL_LABEL(rounded)             // branch past rounding code if exact
   mov     r5, xl                   // and if not, save output LSW too

   // Rounding: we shift r4 left to put the round bit into the carry flag so
   // that ADCS+ADC will conditionally increment the mantissa. But before we do
   // the additions, we also check the Z flag, which tells us whether the
   // remaining 31 bits are all zero. If so, we're either in the round-to-even
   // (RTE) halfway case, or the exact case - but the exact case never came
   // through this code at all, so it must be RTE.
   //
   // If those 31 bits _aren't_ all zero, we clear the top bit of r4, leaving it
   // set only in the round-to-even case. Then (r4 >> 31) can be used to clear
   // the low bit to perform RTE.
   lsls    r12, r4, #1              // test round word
   bicne   r4, r4, #0x80000000      // make top bit of r4 into the RTE bit
   adcs    xl, xl, #0               // conditionally increment the mantissa
   adc     xh, xh, #0               // ... and carry into its high word
   bic     xl, xl, r4, lsr #31      // round to even if r4[31] != 0

 LOCAL_LABEL(rounded):
   // Now we've rounded the output. The last thing we must do is check for
   // overflow and underflow: if neither has happened, we can return.
   //
   // yl contains the pre-rounding output exponent minus 1 (so that the leading
   // mantissa bit incremented it to the right output value). If this is in the
   // range [0,0x7fd] then the leading bit would have incremented it to
   // [1,0x7fe], which are non-overflowed output exponents. So an unsigned check
   // if yl >= 0x7fe detects both overflow and underflow at once.
   movw    r12, #0x7FE
   cmp     yl, r12
   poplo   {r4,r5,r6,pc}

   // We have either an underflow or an overflow. We can tell which it is by
   // doing a _signed_ comparison of yl with the same value again - and since we
   // only just did the CMP instruction, we can reuse the same flags.
   bge     LOCAL_LABEL(overflow)

   // Now we're dealing with an underflow. Set r2 to the rounding direction, by
   // first checking xl against r5 (where we saved its pre-rounding value) to
   // see if we rounded up or down, and then overriding that by checking r6
   // (where we saved the round word) to see if we didn't round at all. In the
   // latter case the comparison against r5 will deliver nonsense, but then we
   // overwrite it, so it doesn't matter.
   cmp     xl, r5                   // did we modify the LSW, i.e. round up?
   movne   r2, #-1                  // if so, the true value is a bit smaller
   moveq   r2, #+1                  // else it's a bit bigger
   cmp     r6, #0                   // except maybe we didn't round at all
   moveq   r2, #0                   // in which case the true value is exact.

   // Add the IEEE 754 exponent bias, and tail-call __dunder to handle the rest
   // of the job.
   add     xh, xh, #0x60000000
   pop     {r4,r5,r6,lr}
   b       SYMBOL_NAME(__compiler_rt_dunder)

 LOCAL_LABEL(overflow):
   // Here, we overflowed, so we must return an infinity of the correct sign.
   // Rebias the exponent, which corrects the sign bit.
   sub     xh, xh, #0x60000000

   // And pop our scratch registers before falling through into dmul_retinf.
   pop     {r4,r5,r6,lr}

 LOCAL_LABEL(retinf):
   // This is entered from the overflow handler and also from cases with
   // infinite inputs. It constructs an infinity, with sign bit equal to the
   // high bit of xh.
   //
   // On entry to here, we expect not to have a stack frame any more, because
   // one of our callers will have popped it already in order to conditionally
   // tailcall __dnan2.
   mov     xl, #0                   // clear low word
   mvn     xh, xh, lsr #31          // shift xh[31] down to bit 0, inverted
   mvn     xh, xh, lsl #11          // uninvert, and put exponent 0x7ff below it
   lsl     xh, xh, #20              // shift back up to the top
   bx      lr

 LOCAL_LABEL(uncommon):
   // We come here from the entry point, if any input had exponent 0 or 0x7ff.
   // First we must repeat the instruction from the entry point that sets up r5
   // with the exponent of y, this time unconditionally, so we know we have both
   // exponents in the top halves of r14 and r5.
   and     r5, r12, yh, lsr #4

   // Check if either exponent is 0x7ff, by comparing against the value left in
   // r12 by the entry point. If so, branch away to handle NaNs and infinities.
   teq     r14, r12
   teqne   r5, r12
   beq     LOCAL_LABEL(naninf)

   // If we didn't branch, we're dealing with finite numbers, including a zero
   // or a denormal or both.
   //
   // First save the output sign.
   eor     r6, xh, yh

   // Handle zeroes first, because if there's a zero we don't have to worry
   // about denormals at all.
   orrs    r4, xl, xh, lsl #1      // is x zero?
   orrsne  r4, yl, yh, lsl #1      // or is y zero?
   beq     LOCAL_LABEL(retzero)            // Return zero if so

   // Otherwise, delegate to __dnorm2 to handle denormals, converting them into
   // a normalised mantissa and an out-of-range exponent. __dnorm2 expects the
   // exponents at the bottom of their words instead of half way up, so shift
   // down first, and back up again afterwards.
   //
   // This call clobbers r12, because we didn't bother to save it on the stack.
   // That's fine, because we don't need the constant in it any more. When we go
   // back to dmul_mul, that will use it as a scratch register.
   lsr     r4, r14, #16
   lsr     r5, r5, #16
   push    {r0, r1, r2, r3, r4, r5} // create a 'struct dnorm2' on the stack
   mov     r0, sp                   // pass it by address
   bl      SYMBOL_NAME(__compiler_rt_dnorm2)
   pop     {r0, r1, r2, r3, r4, r5}
   lsl     r14, r4, #16
   lsls    r5, r5, #16

   // Put the output sign at the bottom of r14, the same place the fast path
   // would have left it. Then rejoin the fast path.
   orr     r14, r14, r6, lsr #31
   b       LOCAL_LABEL(mul)

 LOCAL_LABEL(retzero):
   // Return an exact zero, with sign bit from the high bit of r6.
   mov     xl, #0                  // low word is 0
   ands    xh, r6, #0x80000000     // high word is 0 except for the sign
   pop     {r4,r5,r6,pc}

 LOCAL_LABEL(naninf):
   // We come here knowing that at least one operand is either NaN or infinity.
   // If there's a NaN, we can tailcall __dnan2 to do the right thing. Pop our
   // stacked registers first: we won't need that much spare space any more, and
   // it makes the tailcall easier if we've already done it.
   pop     {r4,r5,r6,lr}

   // A number is a NaN if its exponent is 0x7ff and at least one bit below that
   // is set. The CMP + ADC pair here converts the two words xh:xl into a single
   // word containing xh shifted up by one (throwing away the sign bit which
   // makes no difference), with its low bit set if xl was nonzero. So if that
   // is strictly greater than 0xffe00000, then x was a NaN.
   cmp     xl, #1
   adc     r12, xh, xh
   cmp     r12, #0xFFE00000
   bhi     SYMBOL_NAME(__compiler_rt_dnan2)
   // Now check y in the same way.
   cmp     yl, #1
   adc     r12, yh, yh
   cmp     r12, #0xFFE00000
   bhi     SYMBOL_NAME(__compiler_rt_dnan2)

   // Now we know there are no NaNs. Therefore there's at least one infinity. If
   // either operand is zero then we have inf * 0 = invalid operation and must
   // return a NaN.
   orrs    r12, xl, xh, lsl #1     // are all bits of x zero except the sign?
   beq     LOCAL_LABEL(retnan)             // if so, x == 0, so y == inf
   orrs    r12, yl, yh, lsl #1     // same check the other way round
   beq     LOCAL_LABEL(retnan)

   // If we have an infinity and no NaN, then we just return an infinity of the
   // correct sign.
   eor     xh, xh, yh
   b       LOCAL_LABEL(retinf)

 LOCAL_LABEL(retnan):
   // Return the default NaN, in the case where the inputs were 0 and infinity.
   movw    xh, 0x7ff8
   lsls    xh, xh, #16
   mov     xl, #0
   bx      lr

 END_COMPILERRT_FUNCTION(__aeabi_dmul)

 NO_EXEC_STACK_DIRECTIVE
	//===-- muldf3.S - double-precision floating point multiplication ---------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements the __muldf3 function (double precision floating point
	// multiplication), with the IEEE-754 default rounding (to nearest, ties to
	// even), for the Arm and Thumb2 ISAs.
	//
	//===----------------------------------------------------------------------===//

	#include "../assembly.h"
	#include "crt_endian.h"

	.syntax unified
	.text
	.p2align 2

	#if __ARM_PCS_VFP
	DEFINE_COMPILERRT_FUNCTION(__muldf3)
	push {r4, lr}
	VMOV_FROM_DOUBLE(r0, r1, d0)
	VMOV_FROM_DOUBLE(r2, r3, d1)
	bl __aeabi_dmul
	VMOV_TO_DOUBLE(d0, r0, r1)
	pop {r4, pc}
	#else
	DEFINE_COMPILERRT_FUNCTION_ALIAS(__muldf3, __aeabi_dmul)
	#endif

	DEFINE_COMPILERRT_FUNCTION(__aeabi_dmul)

	push {r4,r5,r6,lr}

	// Check if either input exponent is 000 or 7FF (i.e. not a normalized
	// number), and if so, branch out of line. If we don't branch out of line,
	// then we've also extracted the exponents of the input values x and y into
	// bits 16..26 of r14 and r5 respectively. But if we do, then that hasn't
	// necessarily been done (because the second AND might have been skipped).
	ldr r12, =0x07FF0000
	ands r14, r12, xh, lsr #4 // sets Z if exponent of x is 0
	andsne r5, r12, yh, lsr #4 // otherwise, sets Z if exponent of y is 0
	teqne r14, r12 // otherwise, sets Z if exponent of x is 7FF
	teqne r5, r12 // otherwise, sets Z if exponent of y is 7FF
	beq LOCAL_LABEL(uncommon) // branch out of line to handle inf/NaN/0/denorm

	// Calculate the sign of the result, and put it in an unused bit of r14.
	eor r4, xh, yh // XOR the input signs to get the result sign
	orr r14, r14, r4, lsr #31 // save it in the low bit of r14

	// Clear the exponent and sign bits from the top word of each mantissa, and
	// set the leading mantissa bit in each one, so that they're in the right
	// form to be multiplied.
	bic xh, xh, r12, lsl #5 // r12 = 0x07FF0000, so r12 << 5 = 0xFF800000
	bic yh, yh, r12, lsl #5
	orr xh, xh, #(1 << 20)
	orr yh, yh, #(1 << 20)

	// Now we're ready to multiply mantissas. This is also the place we'll come
	// back to after decoding denormal inputs. The denormal decoding will also
	// have to set up the same register contents:
	// - fractions in xh/xl and yh/yl, with leading bits at bit 20 of xh/yh
	// - exponents in r14 and r5, starting at bit 16
	// - output sign in r14 bit 0
	LOCAL_LABEL(mul):

	// Multiply the two mantissas as if they were full 64-bit words, delivering a
	// 128-bit output in four registers. We provide three different ways to do
	// this, using different instructions.
	//
	// Interleaved with the multiplication code, we also compute the output
	// exponent by adding the input exponents and rebiasing. This takes two
	// instructions. We schedule each one after a multiplication, to use a delay
	// slot from the multiplication on CPUs where there is one.
	//
	// We add r5 to r14, so that the output exponent is in the top half of r14,
	// and r5 is freed up to be used in the multiplication.
	//
	// We rebias the exponent by subtracting 0x400, which is correct for one of
	// the two places where the leading bit of the product could end up, and will
	// need correcting by one in the other case.
	//
	// Exit conditions from the three-way #if:
	//
	// r4:r5:r6 are the top 96 bits of the 128-bit product, with the leading bit
	// at either bit 8 or bit 9 of r4. The low bit of r6 is forced to 1 if any of
	// the low 32 bits of the 128-bit product were set.
	//
	// The output sign is still in the low bit of r14; the top half contains the
	// preliminary output exponent (yet to be adjusted depending on where the
	// high bit of the product ended up).

	#if __ARM_FEATURE_DSP
	// The UMAAL instruction, which computes a 64-bit product and adds two
	// separate 32-bit values to it, makes this easy.
	umull r6, r4, xh, yl
	add r14, r14, r5 // add exponents, freeing up r5
	umull r12, r5, xl, yl
	sub r14, r14, #0x4000000 // initial rebiasing of exponent
	umaal r6, r5, xl, yh
	umaal r5, r4, xh, yh
	#elif ARM_FP_DMUL_USE_UMLAL
	// The UMLAL instruction computes a 64-bit product and adds a 64-bit value to
	// it. But it doesn't write to the carry flag, so you can't tell if the
	// addition wrapped. Therefore you have to use it in a way that means the
	// addition never wraps. Here we do three of the four multiplications (xl*yl,
	// xlyh, xhyh) in a chain, using UMLAL for the top two, in each case with
	// the 64-bit accumulator consisting of the top half of the previous
	// multiplication, and a high word set to zero before the UMLAL instruction.
	//
	// On Cortex-M3, this is not a win over just using UMULL and doing the
	// additions by hand, because UMLAL takes two cycles longer than UMULL, and
	// it also costs a cycle to initialise each of the two high accumulator words
	// to zero. If the high word of the addend were not zero then those two
	// cycles would be doing something useful, but as it is, they're wasted time.
	//
	// CPUs later than Cortex-M3 - in particular, Cortex-M4 - will do both UMLAL
	// and UMULL much faster, so that this code is a win over the plain UMULL
	// code below. But those CPUs typically have UMAAL anyway and will use the
	// even faster version of the code above. So this code is provided in case
	// it's useful, but won't be enabled unless you manually #define
	// ARM_FP_DMUL_USE_UMLAL.
	umull r12, r6, xl, yl
	add r14, r14, r5 // add exponents, freeing up r5
	movs r5, #0
	umlal r6, r5, xl, yh
	movs r4, #0
	umlal r5, r4, xh, yh
	sub r14, r14, #0x4000000 // initial rebiasing of exponent
	umull xl, yh, xh, yl
	adds r6, r6, xl
	adcs r5, r5, yh
	adc r4, r4, #0
	#else
	// Simplest approach, using plain UMULL to compute each 64-bit product, and
	// separate ADD and ADC instructions to do the additions. On Cortex-M3 this
	// wins over the UMLAL approach: it's one instruction longer, but three
	// cycles quicker, since each use of UMLAL in the above version costs 2
	// cycles.
	umull r4, r12, xh, yl
	add r14, r14, r5 // add exponents, freeing up r5
	umull r6, r5, xl, yh
	sub r14, r14, #0x4000000 // initial rebiasing of exponent
	adds r6, r6, r4
	adcs r5, r5, r12 // carry from here is used below

	umull r4, r12, xh, yh // r12:r4 is top part
	adc yh, r12, #0 // get carry from above addition
	umull r12, xh, xl, yl // xh:r12 is bottom part

	adds r6, r6, xh
	adcs r5, r5, r4
	adcs r4, yh, #0
	#endif

	// Now the full 128-bit product of the two mantissas occupies the four
	// registers r4,r5,r6,r12 (in order from MSW to LSW). Since each input
	// mantissa was in the range [2^52,2^53), the product is in the range
	// [2^104,2^106), which means that the lowest-order word r12 is a long way
	// below the round bit, so that it can only affect cases so close to a
	// rounding boundary that you need to know if it's nonzero to tell whether
	// you're rounding to even. Start by freeing up that register, ensuring the
	// low bit of r6 is set if anything in r12 was nonzero.
	tst r12, r12
	orrne r6, r6, #1

	// Now we can regard the result as a 96-bit value in r4,r5,r6, with its
	// leading bit in either bit 8 or 9 of r4. To move that bit up to its final
	// position in bit 20, we must shift the whole thing left by either 11 or 12
	// bits. Find out which.
	tst r4, #0x200 // is bit 9 set?
	bne LOCAL_LABEL(shift11) // if so, only shift by 11 bits

	// In this branch, we're shifting left by 12 bits. Put the shifted result
	// back into the output registers xh,xl, and the bits lower than the bottom
	// mantissa bit into r4.
	lsls xh, r4, #12 // shift each input reg left 12
	lsls xl, r5, #12
	lsls r4, r6, #12
	orr xh, xh, r5, lsr #20 // and the top two right by 32-12
	orr xl, xl, r6, lsr #20

	b LOCAL_LABEL(shifted)

	LOCAL_LABEL(shift11):
	// In this branch, we're shifting left by 11 bits instead of 12, and we must
	// adjust the exponent by 1 to compensate.
	lsls xh, r4, #11 // shift each input reg left 11
	lsls xl, r5, #11
	lsls r4, r6, #11
	orr xh, xh, r5, lsr #21 // and the top two right by 32-11
	orr xl, xl, r6, lsr #21
	add r14, r14, #0x10000 // adjust the exponent

	LOCAL_LABEL(shifted):
	// We've reconverged after shifting the mantissa, so that now the leading 1
	// bit of the mantissa is in bit 20 of xh, and r4 contains the bits lower
	// than the bottom of xl.

	// Recombine the sign and exponent into the high bits of xh. If the exponent
	// is over- or underflowed, this may not give a valid FP result, but because
	// everything is put on by addition, it will be right "mod 2^64" so that we
	// can bias the exponent back into range for underflow handling and that will
	// recover the right sign.
	//
	// r14 still has the output sign in its low bit. To extract just the exponent
	// for adding to xh, we could use BIC to clear that bit, or shift the value
	// right. We do the latter, which saves a copy of the pre-rounding exponent
	// in yl, to use later for overflow detection. The shift is ASR, so that if
	// the exponent is negative due to underflow, it stays negative.
	asr yl, r14, #16 // isolate the exponent
	add xh, xh, yl, lsl #20 // shift it back up to add to xh
	add xh, xh, r14, lsl #31 // then add the sign

	// If we have to handle an underflow, we'll need enough information to
	// reconstruct the rounding direction. Our strategy is
	//
	// - save the LSW of the output before rounding: if that differs from the
	// LSW after rounding then we rounded up
	// - save the round word r4: if that is zero then we didn't round at all.
	//
	// We're going to branch past the rounding code for a quicker exit in the
	// case where we're exact. In that case we don't need to save the output LSW
	// at all, because the zero round word will override whatever it would have
	// been anyway.
	movs r6, r4 // unconditionally save round word
	beq LOCAL_LABEL(rounded) // branch past rounding code if exact
	mov r5, xl // and if not, save output LSW too

	// Rounding: we shift r4 left to put the round bit into the carry flag so
	// that ADCS+ADC will conditionally increment the mantissa. But before we do
	// the additions, we also check the Z flag, which tells us whether the
	// remaining 31 bits are all zero. If so, we're either in the round-to-even
	// (RTE) halfway case, or the exact case - but the exact case never came
	// through this code at all, so it must be RTE.
	//
	// If those 31 bits _aren't_ all zero, we clear the top bit of r4, leaving it
	// set only in the round-to-even case. Then (r4 >> 31) can be used to clear
	// the low bit to perform RTE.
	lsls r12, r4, #1 // test round word
	bicne r4, r4, #0x80000000 // make top bit of r4 into the RTE bit
	adcs xl, xl, #0 // conditionally increment the mantissa
	adc xh, xh, #0 // ... and carry into its high word
	bic xl, xl, r4, lsr #31 // round to even if r4[31] != 0

	LOCAL_LABEL(rounded):
	// Now we've rounded the output. The last thing we must do is check for
	// overflow and underflow: if neither has happened, we can return.
	//
	// yl contains the pre-rounding output exponent minus 1 (so that the leading
	// mantissa bit incremented it to the right output value). If this is in the
	// range [0,0x7fd] then the leading bit would have incremented it to
	// [1,0x7fe], which are non-overflowed output exponents. So an unsigned check
	// if yl >= 0x7fe detects both overflow and underflow at once.
	movw r12, #0x7FE
	cmp yl, r12
	poplo {r4,r5,r6,pc}

	// We have either an underflow or an overflow. We can tell which it is by
	// doing a _signed_ comparison of yl with the same value again - and since we
	// only just did the CMP instruction, we can reuse the same flags.
	bge LOCAL_LABEL(overflow)

	// Now we're dealing with an underflow. Set r2 to the rounding direction, by
	// first checking xl against r5 (where we saved its pre-rounding value) to
	// see if we rounded up or down, and then overriding that by checking r6
	// (where we saved the round word) to see if we didn't round at all. In the
	// latter case the comparison against r5 will deliver nonsense, but then we
	// overwrite it, so it doesn't matter.
	cmp xl, r5 // did we modify the LSW, i.e. round up?
	movne r2, #-1 // if so, the true value is a bit smaller
	moveq r2, #+1 // else it's a bit bigger
	cmp r6, #0 // except maybe we didn't round at all
	moveq r2, #0 // in which case the true value is exact.

	// Add the IEEE 754 exponent bias, and tail-call __dunder to handle the rest
	// of the job.
	add xh, xh, #0x60000000
	pop {r4,r5,r6,lr}
	b SYMBOL_NAME(__compiler_rt_dunder)

	LOCAL_LABEL(overflow):
	// Here, we overflowed, so we must return an infinity of the correct sign.
	// Rebias the exponent, which corrects the sign bit.
	sub xh, xh, #0x60000000

	// And pop our scratch registers before falling through into dmul_retinf.
	pop {r4,r5,r6,lr}

	LOCAL_LABEL(retinf):
	// This is entered from the overflow handler and also from cases with
	// infinite inputs. It constructs an infinity, with sign bit equal to the
	// high bit of xh.
	//
	// On entry to here, we expect not to have a stack frame any more, because
	// one of our callers will have popped it already in order to conditionally
	// tailcall __dnan2.
	mov xl, #0 // clear low word
	mvn xh, xh, lsr #31 // shift xh[31] down to bit 0, inverted
	mvn xh, xh, lsl #11 // uninvert, and put exponent 0x7ff below it
	lsl xh, xh, #20 // shift back up to the top
	bx lr

	LOCAL_LABEL(uncommon):
	// We come here from the entry point, if any input had exponent 0 or 0x7ff.
	// First we must repeat the instruction from the entry point that sets up r5
	// with the exponent of y, this time unconditionally, so we know we have both
	// exponents in the top halves of r14 and r5.
	and r5, r12, yh, lsr #4

	// Check if either exponent is 0x7ff, by comparing against the value left in
	// r12 by the entry point. If so, branch away to handle NaNs and infinities.
	teq r14, r12
	teqne r5, r12
	beq LOCAL_LABEL(naninf)

	// If we didn't branch, we're dealing with finite numbers, including a zero
	// or a denormal or both.
	//
	// First save the output sign.
	eor r6, xh, yh

	// Handle zeroes first, because if there's a zero we don't have to worry
	// about denormals at all.
	orrs r4, xl, xh, lsl #1 // is x zero?
	orrsne r4, yl, yh, lsl #1 // or is y zero?
	beq LOCAL_LABEL(retzero) // Return zero if so

	// Otherwise, delegate to __dnorm2 to handle denormals, converting them into
	// a normalised mantissa and an out-of-range exponent. __dnorm2 expects the
	// exponents at the bottom of their words instead of half way up, so shift
	// down first, and back up again afterwards.
	//
	// This call clobbers r12, because we didn't bother to save it on the stack.
	// That's fine, because we don't need the constant in it any more. When we go
	// back to dmul_mul, that will use it as a scratch register.
	lsr r4, r14, #16
	lsr r5, r5, #16
	push {r0, r1, r2, r3, r4, r5} // create a 'struct dnorm2' on the stack
	mov r0, sp // pass it by address
	bl SYMBOL_NAME(__compiler_rt_dnorm2)
	pop {r0, r1, r2, r3, r4, r5}
	lsl r14, r4, #16
	lsls r5, r5, #16

	// Put the output sign at the bottom of r14, the same place the fast path
	// would have left it. Then rejoin the fast path.
	orr r14, r14, r6, lsr #31
	b LOCAL_LABEL(mul)

	LOCAL_LABEL(retzero):
	// Return an exact zero, with sign bit from the high bit of r6.
	mov xl, #0 // low word is 0
	ands xh, r6, #0x80000000 // high word is 0 except for the sign
	pop {r4,r5,r6,pc}

	LOCAL_LABEL(naninf):
	// We come here knowing that at least one operand is either NaN or infinity.
	// If there's a NaN, we can tailcall __dnan2 to do the right thing. Pop our
	// stacked registers first: we won't need that much spare space any more, and
	// it makes the tailcall easier if we've already done it.
	pop {r4,r5,r6,lr}

	// A number is a NaN if its exponent is 0x7ff and at least one bit below that
	// is set. The CMP + ADC pair here converts the two words xh:xl into a single
	// word containing xh shifted up by one (throwing away the sign bit which
	// makes no difference), with its low bit set if xl was nonzero. So if that
	// is strictly greater than 0xffe00000, then x was a NaN.
	cmp xl, #1
	adc r12, xh, xh
	cmp r12, #0xFFE00000
	bhi SYMBOL_NAME(__compiler_rt_dnan2)
	// Now check y in the same way.
	cmp yl, #1
	adc r12, yh, yh
	cmp r12, #0xFFE00000
	bhi SYMBOL_NAME(__compiler_rt_dnan2)

	// Now we know there are no NaNs. Therefore there's at least one infinity. If
	// either operand is zero then we have inf * 0 = invalid operation and must
	// return a NaN.
	orrs r12, xl, xh, lsl #1 // are all bits of x zero except the sign?
	beq LOCAL_LABEL(retnan) // if so, x == 0, so y == inf
	orrs r12, yl, yh, lsl #1 // same check the other way round
	beq LOCAL_LABEL(retnan)

	// If we have an infinity and no NaN, then we just return an infinity of the
	// correct sign.
	eor xh, xh, yh
	b LOCAL_LABEL(retinf)

	LOCAL_LABEL(retnan):
	// Return the default NaN, in the case where the inputs were 0 and infinity.
	movw xh, 0x7ff8
	lsls xh, xh, #16
	mov xl, #0
	bx lr

	END_COMPILERRT_FUNCTION(__aeabi_dmul)

	NO_EXEC_STACK_DIRECTIVE