blob: b73cd7580fbf25afa1f2daad9ae9dae319cc2cd0 [file] [edit]
//===-- muldf3.S - double-precision floating point multiplication ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the __muldf3 function (double precision floating point
// multiplication), with the IEEE-754 default rounding (to nearest, ties to
// even), for the Arm and Thumb2 ISAs.
//
//===----------------------------------------------------------------------===//
#include "../assembly.h"
#include "crt_endian.h"
.syntax unified
.text
.p2align 2
#if __ARM_PCS_VFP
DEFINE_COMPILERRT_FUNCTION(__muldf3)
push {r4, lr}
VMOV_FROM_DOUBLE(r0, r1, d0)
VMOV_FROM_DOUBLE(r2, r3, d1)
bl __aeabi_dmul
VMOV_TO_DOUBLE(d0, r0, r1)
pop {r4, pc}
#else
DEFINE_COMPILERRT_FUNCTION_ALIAS(__muldf3, __aeabi_dmul)
#endif
DEFINE_COMPILERRT_FUNCTION(__aeabi_dmul)
push {r4,r5,r6,lr}
// Check if either input exponent is 000 or 7FF (i.e. not a normalized
// number), and if so, branch out of line. If we don't branch out of line,
// then we've also extracted the exponents of the input values x and y into
// bits 16..26 of r14 and r5 respectively. But if we do, then that hasn't
// necessarily been done (because the second AND might have been skipped).
ldr r12, =0x07FF0000
ands r14, r12, xh, lsr #4 // sets Z if exponent of x is 0
andsne r5, r12, yh, lsr #4 // otherwise, sets Z if exponent of y is 0
teqne r14, r12 // otherwise, sets Z if exponent of x is 7FF
teqne r5, r12 // otherwise, sets Z if exponent of y is 7FF
beq LOCAL_LABEL(uncommon) // branch out of line to handle inf/NaN/0/denorm
// Calculate the sign of the result, and put it in an unused bit of r14.
eor r4, xh, yh // XOR the input signs to get the result sign
orr r14, r14, r4, lsr #31 // save it in the low bit of r14
// Clear the exponent and sign bits from the top word of each mantissa, and
// set the leading mantissa bit in each one, so that they're in the right
// form to be multiplied.
bic xh, xh, r12, lsl #5 // r12 = 0x07FF0000, so r12 << 5 = 0xFF800000
bic yh, yh, r12, lsl #5
orr xh, xh, #(1 << 20)
orr yh, yh, #(1 << 20)
// Now we're ready to multiply mantissas. This is also the place we'll come
// back to after decoding denormal inputs. The denormal decoding will also
// have to set up the same register contents:
// - fractions in xh/xl and yh/yl, with leading bits at bit 20 of xh/yh
// - exponents in r14 and r5, starting at bit 16
// - output sign in r14 bit 0
LOCAL_LABEL(mul):
// Multiply the two mantissas as if they were full 64-bit words, delivering a
// 128-bit output in four registers. We provide three different ways to do
// this, using different instructions.
//
// Interleaved with the multiplication code, we also compute the output
// exponent by adding the input exponents and rebiasing. This takes two
// instructions. We schedule each one after a multiplication, to use a delay
// slot from the multiplication on CPUs where there is one.
//
// We add r5 to r14, so that the output exponent is in the top half of r14,
// and r5 is freed up to be used in the multiplication.
//
// We rebias the exponent by subtracting 0x400, which is correct for one of
// the two places where the leading bit of the product could end up, and will
// need correcting by one in the other case.
//
// Exit conditions from the three-way #if:
//
// r4:r5:r6 are the top 96 bits of the 128-bit product, with the leading bit
// at either bit 8 or bit 9 of r4. The low bit of r6 is forced to 1 if any of
// the low 32 bits of the 128-bit product were set.
//
// The output sign is still in the low bit of r14; the top half contains the
// preliminary output exponent (yet to be adjusted depending on where the
// high bit of the product ended up).
#if __ARM_FEATURE_DSP
// The UMAAL instruction, which computes a 64-bit product and adds two
// separate 32-bit values to it, makes this easy.
umull r6, r4, xh, yl
add r14, r14, r5 // add exponents, freeing up r5
umull r12, r5, xl, yl
sub r14, r14, #0x4000000 // initial rebiasing of exponent
umaal r6, r5, xl, yh
umaal r5, r4, xh, yh
#elif ARM_FP_DMUL_USE_UMLAL
// The UMLAL instruction computes a 64-bit product and adds a 64-bit value to
// it. But it doesn't write to the carry flag, so you can't tell if the
// addition wrapped. Therefore you have to use it in a way that means the
// addition never wraps. Here we do three of the four multiplications (xl*yl,
// xl*yh, xh*yh) in a chain, using UMLAL for the top two, in each case with
// the 64-bit accumulator consisting of the top half of the previous
// multiplication, and a high word set to zero before the UMLAL instruction.
//
// On Cortex-M3, this is not a win over just using UMULL and doing the
// additions by hand, because UMLAL takes two cycles longer than UMULL, and
// it also costs a cycle to initialise each of the two high accumulator words
// to zero. If the high word of the addend were not zero then those two
// cycles would be doing something useful, but as it is, they're wasted time.
//
// CPUs later than Cortex-M3 - in particular, Cortex-M4 - will do both UMLAL
// and UMULL much faster, so that this code is a win over the plain UMULL
// code below. But those CPUs typically have UMAAL anyway and will use the
// even faster version of the code above. So this code is provided in case
// it's useful, but won't be enabled unless you manually #define
// ARM_FP_DMUL_USE_UMLAL.
umull r12, r6, xl, yl
add r14, r14, r5 // add exponents, freeing up r5
movs r5, #0
umlal r6, r5, xl, yh
movs r4, #0
umlal r5, r4, xh, yh
sub r14, r14, #0x4000000 // initial rebiasing of exponent
umull xl, yh, xh, yl
adds r6, r6, xl
adcs r5, r5, yh
adc r4, r4, #0
#else
// Simplest approach, using plain UMULL to compute each 64-bit product, and
// separate ADD and ADC instructions to do the additions. On Cortex-M3 this
// wins over the UMLAL approach: it's one instruction longer, but three
// cycles quicker, since each use of UMLAL in the above version costs 2
// cycles.
umull r4, r12, xh, yl
add r14, r14, r5 // add exponents, freeing up r5
umull r6, r5, xl, yh
sub r14, r14, #0x4000000 // initial rebiasing of exponent
adds r6, r6, r4
adcs r5, r5, r12 // carry from here is used below
umull r4, r12, xh, yh // r12:r4 is top part
adc yh, r12, #0 // get carry from above addition
umull r12, xh, xl, yl // xh:r12 is bottom part
adds r6, r6, xh
adcs r5, r5, r4
adcs r4, yh, #0
#endif
// Now the full 128-bit product of the two mantissas occupies the four
// registers r4,r5,r6,r12 (in order from MSW to LSW). Since each input
// mantissa was in the range [2^52,2^53), the product is in the range
// [2^104,2^106), which means that the lowest-order word r12 is a long way
// below the round bit, so that it can only affect cases so close to a
// rounding boundary that you need to know if it's nonzero to tell whether
// you're rounding to even. Start by freeing up that register, ensuring the
// low bit of r6 is set if anything in r12 was nonzero.
tst r12, r12
orrne r6, r6, #1
// Now we can regard the result as a 96-bit value in r4,r5,r6, with its
// leading bit in either bit 8 or 9 of r4. To move that bit up to its final
// position in bit 20, we must shift the whole thing left by either 11 or 12
// bits. Find out which.
tst r4, #0x200 // is bit 9 set?
bne LOCAL_LABEL(shift11) // if so, only shift by 11 bits
// In this branch, we're shifting left by 12 bits. Put the shifted result
// back into the output registers xh,xl, and the bits lower than the bottom
// mantissa bit into r4.
lsls xh, r4, #12 // shift each input reg left 12
lsls xl, r5, #12
lsls r4, r6, #12
orr xh, xh, r5, lsr #20 // and the top two right by 32-12
orr xl, xl, r6, lsr #20
b LOCAL_LABEL(shifted)
LOCAL_LABEL(shift11):
// In this branch, we're shifting left by 11 bits instead of 12, and we must
// adjust the exponent by 1 to compensate.
lsls xh, r4, #11 // shift each input reg left 11
lsls xl, r5, #11
lsls r4, r6, #11
orr xh, xh, r5, lsr #21 // and the top two right by 32-11
orr xl, xl, r6, lsr #21
add r14, r14, #0x10000 // adjust the exponent
LOCAL_LABEL(shifted):
// We've reconverged after shifting the mantissa, so that now the leading 1
// bit of the mantissa is in bit 20 of xh, and r4 contains the bits lower
// than the bottom of xl.
// Recombine the sign and exponent into the high bits of xh. If the exponent
// is over- or underflowed, this may not give a valid FP result, but because
// everything is put on by addition, it will be right "mod 2^64" so that we
// can bias the exponent back into range for underflow handling and that will
// recover the right sign.
//
// r14 still has the output sign in its low bit. To extract just the exponent
// for adding to xh, we could use BIC to clear that bit, or shift the value
// right. We do the latter, which saves a copy of the pre-rounding exponent
// in yl, to use later for overflow detection. The shift is ASR, so that if
// the exponent is negative due to underflow, it stays negative.
asr yl, r14, #16 // isolate the exponent
add xh, xh, yl, lsl #20 // shift it back up to add to xh
add xh, xh, r14, lsl #31 // then add the sign
// If we have to handle an underflow, we'll need enough information to
// reconstruct the rounding direction. Our strategy is
//
// - save the LSW of the output before rounding: if that differs from the
// LSW after rounding then we rounded up
// - save the round word r4: if that is zero then we didn't round at all.
//
// We're going to branch past the rounding code for a quicker exit in the
// case where we're exact. In that case we don't need to save the output LSW
// at all, because the zero round word will override whatever it would have
// been anyway.
movs r6, r4 // unconditionally save round word
beq LOCAL_LABEL(rounded) // branch past rounding code if exact
mov r5, xl // and if not, save output LSW too
// Rounding: we shift r4 left to put the round bit into the carry flag so
// that ADCS+ADC will conditionally increment the mantissa. But before we do
// the additions, we also check the Z flag, which tells us whether the
// remaining 31 bits are all zero. If so, we're either in the round-to-even
// (RTE) halfway case, or the exact case - but the exact case never came
// through this code at all, so it must be RTE.
//
// If those 31 bits _aren't_ all zero, we clear the top bit of r4, leaving it
// set only in the round-to-even case. Then (r4 >> 31) can be used to clear
// the low bit to perform RTE.
lsls r12, r4, #1 // test round word
bicne r4, r4, #0x80000000 // make top bit of r4 into the RTE bit
adcs xl, xl, #0 // conditionally increment the mantissa
adc xh, xh, #0 // ... and carry into its high word
bic xl, xl, r4, lsr #31 // round to even if r4[31] != 0
LOCAL_LABEL(rounded):
// Now we've rounded the output. The last thing we must do is check for
// overflow and underflow: if neither has happened, we can return.
//
// yl contains the pre-rounding output exponent minus 1 (so that the leading
// mantissa bit incremented it to the right output value). If this is in the
// range [0,0x7fd] then the leading bit would have incremented it to
// [1,0x7fe], which are non-overflowed output exponents. So an unsigned check
// if yl >= 0x7fe detects both overflow and underflow at once.
movw r12, #0x7FE
cmp yl, r12
poplo {r4,r5,r6,pc}
// We have either an underflow or an overflow. We can tell which it is by
// doing a _signed_ comparison of yl with the same value again - and since we
// only just did the CMP instruction, we can reuse the same flags.
bge LOCAL_LABEL(overflow)
// Now we're dealing with an underflow. Set r2 to the rounding direction, by
// first checking xl against r5 (where we saved its pre-rounding value) to
// see if we rounded up or down, and then overriding that by checking r6
// (where we saved the round word) to see if we didn't round at all. In the
// latter case the comparison against r5 will deliver nonsense, but then we
// overwrite it, so it doesn't matter.
cmp xl, r5 // did we modify the LSW, i.e. round up?
movne r2, #-1 // if so, the true value is a bit smaller
moveq r2, #+1 // else it's a bit bigger
cmp r6, #0 // except maybe we didn't round at all
moveq r2, #0 // in which case the true value is exact.
// Add the IEEE 754 exponent bias, and tail-call __dunder to handle the rest
// of the job.
add xh, xh, #0x60000000
pop {r4,r5,r6,lr}
b SYMBOL_NAME(__compiler_rt_dunder)
LOCAL_LABEL(overflow):
// Here, we overflowed, so we must return an infinity of the correct sign.
// Rebias the exponent, which corrects the sign bit.
sub xh, xh, #0x60000000
// And pop our scratch registers before falling through into dmul_retinf.
pop {r4,r5,r6,lr}
LOCAL_LABEL(retinf):
// This is entered from the overflow handler and also from cases with
// infinite inputs. It constructs an infinity, with sign bit equal to the
// high bit of xh.
//
// On entry to here, we expect not to have a stack frame any more, because
// one of our callers will have popped it already in order to conditionally
// tailcall __dnan2.
mov xl, #0 // clear low word
mvn xh, xh, lsr #31 // shift xh[31] down to bit 0, inverted
mvn xh, xh, lsl #11 // uninvert, and put exponent 0x7ff below it
lsl xh, xh, #20 // shift back up to the top
bx lr
LOCAL_LABEL(uncommon):
// We come here from the entry point, if any input had exponent 0 or 0x7ff.
// First we must repeat the instruction from the entry point that sets up r5
// with the exponent of y, this time unconditionally, so we know we have both
// exponents in the top halves of r14 and r5.
and r5, r12, yh, lsr #4
// Check if either exponent is 0x7ff, by comparing against the value left in
// r12 by the entry point. If so, branch away to handle NaNs and infinities.
teq r14, r12
teqne r5, r12
beq LOCAL_LABEL(naninf)
// If we didn't branch, we're dealing with finite numbers, including a zero
// or a denormal or both.
//
// First save the output sign.
eor r6, xh, yh
// Handle zeroes first, because if there's a zero we don't have to worry
// about denormals at all.
orrs r4, xl, xh, lsl #1 // is x zero?
orrsne r4, yl, yh, lsl #1 // or is y zero?
beq LOCAL_LABEL(retzero) // Return zero if so
// Otherwise, delegate to __dnorm2 to handle denormals, converting them into
// a normalised mantissa and an out-of-range exponent. __dnorm2 expects the
// exponents at the bottom of their words instead of half way up, so shift
// down first, and back up again afterwards.
//
// This call clobbers r12, because we didn't bother to save it on the stack.
// That's fine, because we don't need the constant in it any more. When we go
// back to dmul_mul, that will use it as a scratch register.
lsr r4, r14, #16
lsr r5, r5, #16
push {r0, r1, r2, r3, r4, r5} // create a 'struct dnorm2' on the stack
mov r0, sp // pass it by address
bl SYMBOL_NAME(__compiler_rt_dnorm2)
pop {r0, r1, r2, r3, r4, r5}
lsl r14, r4, #16
lsls r5, r5, #16
// Put the output sign at the bottom of r14, the same place the fast path
// would have left it. Then rejoin the fast path.
orr r14, r14, r6, lsr #31
b LOCAL_LABEL(mul)
LOCAL_LABEL(retzero):
// Return an exact zero, with sign bit from the high bit of r6.
mov xl, #0 // low word is 0
ands xh, r6, #0x80000000 // high word is 0 except for the sign
pop {r4,r5,r6,pc}
LOCAL_LABEL(naninf):
// We come here knowing that at least one operand is either NaN or infinity.
// If there's a NaN, we can tailcall __dnan2 to do the right thing. Pop our
// stacked registers first: we won't need that much spare space any more, and
// it makes the tailcall easier if we've already done it.
pop {r4,r5,r6,lr}
// A number is a NaN if its exponent is 0x7ff and at least one bit below that
// is set. The CMP + ADC pair here converts the two words xh:xl into a single
// word containing xh shifted up by one (throwing away the sign bit which
// makes no difference), with its low bit set if xl was nonzero. So if that
// is strictly greater than 0xffe00000, then x was a NaN.
cmp xl, #1
adc r12, xh, xh
cmp r12, #0xFFE00000
bhi SYMBOL_NAME(__compiler_rt_dnan2)
// Now check y in the same way.
cmp yl, #1
adc r12, yh, yh
cmp r12, #0xFFE00000
bhi SYMBOL_NAME(__compiler_rt_dnan2)
// Now we know there are no NaNs. Therefore there's at least one infinity. If
// either operand is zero then we have inf * 0 = invalid operation and must
// return a NaN.
orrs r12, xl, xh, lsl #1 // are all bits of x zero except the sign?
beq LOCAL_LABEL(retnan) // if so, x == 0, so y == inf
orrs r12, yl, yh, lsl #1 // same check the other way round
beq LOCAL_LABEL(retnan)
// If we have an infinity and no NaN, then we just return an infinity of the
// correct sign.
eor xh, xh, yh
b LOCAL_LABEL(retinf)
LOCAL_LABEL(retnan):
// Return the default NaN, in the case where the inputs were 0 and infinity.
movw xh, 0x7ff8
lsls xh, xh, #16
mov xl, #0
bx lr
END_COMPILERRT_FUNCTION(__aeabi_dmul)
NO_EXEC_STACK_DIRECTIVE