blob: ebbd868a04b17ed2050feae54855846e5a291303 [file] [edit]
//===-- divdf3.S - double-precision floating point division ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the __divdf3 function (double precision floating point
// division), with the IEEE-754 default rounding (to nearest, ties to even),
// for the Arm and Thumb2 ISAs.
//
//===----------------------------------------------------------------------===//
#include "../assembly.h"
#include "crt_endian.h"
// The basic strategy of this division code is to use Newton-Raphson iteration
// to calculate an approximation to 1/y, then multiply it by x. This procedure
// delivers a quotient with 10 extra bits of precision, but which isn't exact.
// We know an upper bound on its possible error, which gives an interval of
// possible values for the true quotient. So we can check the 10 extra bits to
// see whether a rounding boundary lies within the interval. If not, then we
// can round and return without worrying further; otherwise, we go to slower
// correction code that multiplies the approximate quotient back up by y and
// checks it against x.
//
// This strategy depends critically on the upper bound on the approximation
// error. Underestimating the error introduces a bug; overestimating it costs
// performance, by sending more cases than necessary to the slow path.
//
// To give high confidence of its correctness, the upper bound has been proved
// formally by Gappa. The Gappa proof and auxiliary code are not included in
// this version, but they can be found in the Arm Optimized Routines repository
//
// https://github.com/ARM-software/optimized-routines/blob/bf3e44c3784dd3e18d3d5232e13b4d81f232310b/fp/at32/ddiv.S
// https://github.com/ARM-software/optimized-routines/blob/bf3e44c3784dd3e18d3d5232e13b4d81f232310b/fp/auxiliary/ddiv-prove.py
// https://github.com/ARM-software/optimized-routines/blob/bf3e44c3784dd3e18d3d5232e13b4d81f232310b/fp/auxiliary/ddiv-diagnostics.c
//
// and a pair of blog posts describing the concepts and procedure are here:
//
// https://developer.arm.com/community/arm-community-blogs/b/embedded-and-microcontrollers-blog/posts/formally-verifying-a-floating-point-division-routine-with-gappa-p1
// https://developer.arm.com/community/arm-community-blogs/b/embedded-and-microcontrollers-blog/posts/formally-verifying-a-floating-point-division-routine-with-gappa-p2
.syntax unified
.text
.p2align 2
#if __ARM_PCS_VFP
DEFINE_COMPILERRT_FUNCTION(__divdf3)
push {r4, lr}
VMOV_FROM_DOUBLE(r0, r1, d0)
VMOV_FROM_DOUBLE(r2, r3, d1)
bl __aeabi_ddiv
VMOV_TO_DOUBLE(d0, r0, r1)
pop {r4, pc}
#else
DEFINE_COMPILERRT_FUNCTION_ALIAS(__divdf3, __aeabi_ddiv)
#endif
DEFINE_COMPILERRT_FUNCTION(__aeabi_ddiv)
push {r4,r5,r6,r7,r8,lr}
// Check if either input exponent 7FF (infinity or NaN), and if so, branch
// out of line.
ldr r12, =0x07FF0000 // mask for exponent cold storage
bics r4, r12, xh, lsr #4 // test for Infs or NaNs
bicsne r4, r12, yh, lsr #4
beq LOCAL_LABEL(ddiv_naninf)
// Extract the exponents of the input values x and y into bits 16..26 of r14
// and r5 respectively, and in the process, check if either exponent is zero
// (so that one or both inputs are 0 or denormal). In order to combine the
// two tests, the second ANDS is performed conditionally, so that if x's
// exponent is zero then the out-of-line code at ddiv_zerodenorm might find
// y's exponent hasn't been set up yet.
//
// We also calculate the sign of the result, which will be needed whether or
// not we branch. This is saved in the low bit of r4.
ands r4, r12, xh, lsr #4 // get exponent of x, setting Z if it's 0
andsne r5, r12, yh, lsr #4 // if not, extract and test exponent of y
eor r6, xh, yh // XOR the input signs to get the result sign
orr r4, r4, r6, lsr #31 // save it in the low bit of r4
beq LOCAL_LABEL(ddiv_zerodenorm) // branch out of line for zeroes or denormals
// Calculate the initial exponent of the result, by subtracting the two input
// exponents and adjusting for the IEEE exponent bias. This value may have to
// be adjusted by 1 later, depending on the quotient of the mantissas.
//
// If we branched to ddiv_zerodenorm above, and it found denormals but no
// zeroes, it may branch back here after renormalising them. We expect the
// out-of-line code to have left the exponent difference in the top half of
// r4 (still with the output sign in the low bit), but not yet to have
// applied the bias. So it branches back in immediately after the SUB.
//
// The exponent bias we want is either 0x3fe or 0x3ff, depending on whether
// we have to shift the output mantissa by 1 below. Neither of those values
// fits in the immediate field of an ADD instruction, so we must use two
// instructions.
sub r4, r4, r5
LOCAL_LABEL(ddiv_normalised): // denormal handler will come back to here
add r4, r4, #0x03FC0000 // add the 8 high bits of the bias 0x3FE
add r4, r4, #0x00020000 // add the remaining bit of the bias
// Shift both mantissas up to the top of their 64-bit register pair, and OR
// in the leading 1 bit, which will occupy the high bit of the high word in
// each case.
mov r5, #(1 << 31) // high bit for ORing in to both mantissas
orr xh, r5, xh, lsl #11 // shift up xh and OR in the high bit
orr yh, r5, yh, lsl #11 // same for yh
orr xh, xh, xl, lsr #21 // OR in the bits shifted out of xl into xh
orr yh, yh, yl, lsr #21 // same for yl and yh
lsl xl, xl, #11 // shift up the rest of xl
lsl yl, yl, #11 // same for yl
// Check if the two mantissas are exactly equal, so that the quotient is
// exactly a power of 2. If so, branch out of line to handle that case
// specially.
//
// This guarantees that when we examine the approximate quotient afterwards,
// we can't be confused about whether it needs to be renormalised, which
// would otherwise cost just as much effort as this check. Our reciprocal
// approximation is always an underestimate (that's in the nature of this
// particular Newton-Raphson iteration), so if x < y (meaning the mantissas
// rather than the whole floats) then even the true quotient will be less
// than 1, and the approximation even more so. On the other hand, if x > y,
// then the true quotient will be enough greater than 1 that even the largest
// possible error in the approximation can't make it look like less than 1.
//
// (Proof: regard x,y as normalised to the range [1,2). If x > y, then we
// have x ≥ y+ε, where ε is the machine epsilon. So x/y ≥ 1+ε/y > 1+ε/2. And
// the bound on the approximation error, given below, is far less than ε/2.)
cmp xh, yh
cmpeq xl, yl
beq LOCAL_LABEL(ddiv_result_is_power_of_2)
// Now we begin the actual calculation of the reciprocal approximation.
//
// We begin with our two input mantissas stored in xh:xl and yh:yl, each with
// its leading 1 explicit and shifted up to the top of the word. So they can
// be regarded as 64-bit integers with the high bit set and the bottom 11
// bits clear.
// Obtain an 8-bit reciprocal approximation by using the topmost 8 bits of y
// as a lookup table. The top bit of y is always set, so there are only 128
// lookup table entries, not 256. The 8-bit value we load also has its top
// bit set.
lsr r5, yh, #24 // r5 is the table index plus 0x80
// Get the address of reciptbl, in various ways depending on position-
// independence and Arm/Thumb state.
//
// Since the table index calculated above in r5 includes the high mantissa
// bit, an index of 0x80 refers to the first table entry and 0xFF the last.
// So we subtract 0x80 from the table address to compensate.
#if defined __pic__ || defined __PIC__ || defined __ARM_ROPI
// In PIC or ROPI modes, we must construct the address in a pc-relative
// manner, by making a literal containing the offset from the current code.
// The reference point for that offset is the value of pc as read by the add
// instruction at get_reciptbl below, which will be 4 or 8 bytes after it in
// Thumb or Arm state respectively.
#if __thumb__
ldr r6, =(LOCAL_LABEL(reciptbl)-0x80) - (LOCAL_LABEL(get_reciptbl)+4)
#else
ldr r6, =(LOCAL_LABEL(reciptbl)-0x80) - (LOCAL_LABEL(get_reciptbl)+8)
#endif
LOCAL_LABEL(get_reciptbl):
add r6, r6, pc
#else
// If we're not building for position independence, we can just load the
// target address directly.
ldr r6, =(LOCAL_LABEL(reciptbl)-0x80)
#endif
ldrb r6, [r6, r5] // and load the approximation into r6
// First Newton-Raphson iteration, which expands that 8-bit approximation to
// a 17-bit one, again with its top bit set. We use the top 16 bits of y for
// this, so that we can fit the multiplications into ordinary MUL rather than
// UMULL.
//
// The Newton-Raphson formula to turn an approximation r ≈ 1/y into a better
// one is r → r(2-yr). In this case we're scaling up to integers (informal
// fixed point), so the 2 becomes 2^24.
lsr r5, yh, #16 // get top halfword of y
mul r7, r6, r5 // multiply it by the input value r
rsb r7, r7, #(1 << 24) // subtract from 2 (scaled up appropriately)
mul r7, r6, r7 // multiply again to make r(2-yr)
lsr r7, r7, #14 // shift down to keep only 17 bits of it
// Second iteration, expanding into a 32-bit reciprocal, using the top 31
// bits of y (i.e. yh shifted by 1). The first multiplication (making yr) is
// 32x32 → 64 bits, so we use a single UMULL; the second one making r(2-yr)
// is 32x64, which we do with a UMULL by the bottom half of yr and then MLA
// by the top half, so we only keep the low 64 bits of the full answer.
//
// The subtraction from 2 (again scaled up, this time to 2^48) is done by
// RSBS+RSC, interleaved with the multiplications so as to use a delay slot
// on CPUs that have one.
lsr r12, yh, #1
umull r6, r8, r7, r12 // r8:r6 = yr
rsbs r6, r6, #0 // low half of subtraction from 2
umull r12, lr, r7, r6 // multiply r by the low half of 2-yr
#if __thumb__
// Thumb has no RSC, so simulate it by bitwise inversion and then ADC
mvn r8, r8
adc r8, r8, #(1 << 16)
#else
rsc r8, r8, #(1 << 16) // high half of subtraction from 2
#endif
mla r6, r7, r8, lr // multiply r by the high half of 2-yr
// Third iteration, expanding into a 64-bit reciprocal, with the leading bit
// expected to end up in bit 60. Now the first multiplication to make yr is
// 32x64 → 96 bits, so we put the product in three registers lr:r12:r8.
// However, we're going to discard the low word r8 completely, because it
// makes negligible difference. So we'll treat the output yr as 64-bit.
umull r8, r12, r6, yl // multiply r by bottom half of y
mov lr, #0 // initialize high word to 0
umlal r12, lr, r6, yh // multiply r by top half of y
// Subtract from a power of 2, as usual. But in this case the power of 2
// we're subtracting from is 2^64, which is just off the top of the 64-bit
// value in lr:r12. So in fact we're just negating the whole thing!
//
// To preserve the invariant that the approximation error is always negative,
// we negate via one's complement rather than two's. (This would only make a
// difference if r8 had happened to be exactly 0. That in turn can occur when
// yl=0, so one of the test cases in ddiv-diagnostics.c deliberately uses
// such a value, so that the intermediate results can be checked against the
// reference Python.)
mvn r12, r12
mvn lr, lr
// Now lr:r12:r8 contains 2-yr. We discard the low word r8 to reduce that to
// 64 bits, and do another 32x64 → 96 bit multiplication.
umull r5, r8, r6, r12 // multiply r by bottom half of 2-yr
mov r7, #0 // initialize high word to 0
umlal r8, r7, r6, lr // multiply r by top half of 2-yr
// That's the Newton-Raphson iteration done: we have a 64-bit approximation
// to 1/y. Multiply it by x to get the full approximate quotient.
//
// In principle, this would be a 64x64 → 128 bit multiplication, involving
// four long multiply instructions. But we only need the top 64 bits, and
// we're already prepared to tolerate some error in the calculations, so we
// cut corners: don't multiply the two low words together at all, and we
// discard the bottom half of each of the (low * high) partial products
// without bothering to propagate carries out of it.
//
// (All of these shortcuts are faithfully mimicked in the Python reference
// implementation which generates Gappa input, so they're all accounted for
// in the error analysis.)
#if __ARM_FEATURE_DSP
umull r12, r6, xh, r8 // r6 = high word of x * low word of 1/y
umull r12, r5, xl, r7 // r5 = low word of x * high word of 1/y
umaal r6, r5, xh, r7 // add those to the product of both high words
#else
// Alternative instruction sequence using UMLAL, if UMAAL isn't available
umull r12, r6, xh, r8 // r6 = high word of x * low word of 1/y
umull r12, lr, xl, r7 // lr = low word of x * high word of 1/y
adds r6, r6, lr // add those together
mov r5, #0 // set r5 to the carry out of that addition
adc r5, r5, #0
umlal r6, r5, xh, r7 // add that to the product of both high words
#endif
// Now r5:r6 is the completed approximate quotient, with its leading bit at
// position either 61 or 62.
// Normalize so that the leading bit is always in bit 60, by shifting left if
// it isn't there already, and adjusting the output exponent by 1 to
// compensate.
//
// We do the test in a slightly tricky way, by arranging to set the V flag if
// the leading bit is in bit 60. This allows us to do the left shift under
// the VC condition, which is convenient because the LSLS instruction that
// shifts the low word left moves the top bit into the C flag without
// affecting V.
//
// We also save the value written into lr by the initial ADDS instruction,
// because that contains enough information to tell us whether we
// renormalised here. The correction path for quotients too close to a
// rounding boundary will need to recover that information.
adds lr, r5, #0x40000000 // set V flag if bit 62 of high word set
subvc r4, r4, #(1 << 16) // if not, correct the exponent by 1,
lslsvc r6, r6, #1 // shift the low word of the quotient left
adcvc r5, r5, r5 // and shift its top bit into the high word
// Now r5:r6 is the _normalised_ approximate quotient, with its leading bit
// reliably in bit 60. This is the final output of the calculation that the
// Gappa error-analysis proof applies to.
// That 64-bit output has bit 63 clear; the leading 1 bit of the output
// mantissa in bit 62, followed by 52 more mantissa bits; then 10 bits at the
// bottom which are used for determining rounding.
//
// Compute the _approximately_ rounded-to-nearest output mantissa, by adding
// half a ULP and shifting down. If we don't go to the slow path, this is the
// correct output mantissa. (See fdiv.S for the proof that the round-to-even
// tiebreaking case can't occur in floating-point division.)
//
// We keep the original version of r6, containing the ten rounding bits, so
// that we can test it to see if we need the slow path.
adds r7, r6, #(1 << 9) // add half a ULP, copying low word into r7
adc r5, r5, #0 // propagate carry into high word
lsr r7, r7, #10 // shift low word right
orr r7, r7, r5, lsl #22 // combine with bits shifted out of high word
lsr r5, r5, #10 // shift high word right
// Now test r6 to see whether this output mantissa can be relied on, or
// whether the approximation landed too close to a rounding boundary.
//
// The maximum possible error in the approximation, taking into account the
// initial error in each lookup table entry, the remaining mathematical error
// introduced by stopping after this many Newton-Raphson iterations, and
// every shortcut, right shift, truncation and discarding of a partial
// product in the algorithm above, is always negative, and less than 64 units
// in the last place of the 64-bit approximate quotient. That is, the true
// quotient lies somewhere between the 64-bit integer described as "final
// output of the calculation" above, and that plus 64.
//
// So if the bottom 10 bits of r6 have the value 2^9 or greater, we're safe,
// because the true value is _larger_ than the approximation, so if the
// approximation is already above the rounding boundary then so is the true
// value. And if those 10 bits are (2^9-64) or less then we're also safe,
// because even if the true value is greater by 63, it's still on the same
// side of the rounding boundary.
//
// We check the error by subtracting (2^9-64), so that the dangerous values
// of the bottom 10 bits are those in the range 0,...,63, i.e. precisely
// those with none of bits 6,7,8,9 set.
//
// We also combine this test with a check for underflow, because that also
// needs more careful handling (the mantissa must be re-rounded to a
// different bit position, which involves knowing whether it's exact).
// Underflow has happened if the exponent in the top half of r4 is negative
// (it's off by 1 so that the leading mantissa bit will increment it), so we
// test by an ASR#31 (copying the top bit of r4 into all of it) and negating.
// That way, the output value is zero on underflow, matching the flags from
// the other check.
sub r6, r6, #(1 << 9)-64
tst r6, #0x3C0 // now EQ means we must go to the slow path
mvnsne r12, r4, asr #31 // also set EQ if underflow has happened
beq LOCAL_LABEL(ddiv_correction) // branch out of line to do the hard bit
// If we do go to ddiv_correction, it branches back here after the correction
// code has finished. Either way, we expect that r5:r7 is the result
// mantissa, with the top bit set, already in the correct position in the
// word, and already rounded to nearest.
LOCAL_LABEL(ddiv_corrected):
// Recombine the output mantissa with the sign and exponent.
add xh, r5, r4, lsl #31 // add sign bit to top word of mantissa
bic r12, r4, #1 // isolate exponent in top half of r4
add xh, xh, r12, lsl #4 // add exponent to make the final high word
mov xl, r7 // move low word into the right register
// If there's no overflow or underflow, we're done.
//
// We _identified_ underflow above when we went to the slow path, but having
// done that, the slow path came back here, so we must check for it again.
// (The only purpose of the detour was to obtain accurate information about
// whether the quotient is exact, or needed rounding.)
//
// The output exponent, offset downwards by 1, is in the top half of r4. If
// it's negative, there's an underflow; if it's too large, there's an
// overflow. We do an approximate test for both at once via an unsigned
// comparison against 0x7f0, using r12 (the register in which we already
// cleared the sign bit stored at the bottom). This identifies _most_ normal
// outputs as quickly as possible.
//
// 0x7f0 isn't the maximum possible known-safe exponent, but it's the largest
// one that fits in the immediate field of CMP. We deal with the remaining
// cases in the next few instructions.
cmp r12, #(0x7f0 << 16)
popls {r4,r5,r6,r7,r8,pc}
// Now check the remaining cases more carefully.
//
// If r12 < 0 then we definitely have underflow. We detect overflow precisely
// by seeing if the _final_ output exponent (in the output register xh) is
// 0x7ff or more, by incrementing it and seeing if the sign is opposite from
// the intended output sign.
add lr, xh, #(1 << 20) // increment the output exponent field
teq lr, r4, lsl #31 // set N if the sign now doesn't match r4[0]
tstpl r12, r12 // otherwise, set N if underflow
poppl {r4,r5,r6,r7,r8,pc} // if neither, we've finished
// If we still haven't returned, we really do have overflow or underflow, and
// the sign of r12 tells us which.
tst r12, r12
bmi LOCAL_LABEL(ddiv_underflow)
// For overflow, correct the sign by biasing the exponent downward, and go to
// code that constructs an infinite return value (shared with the
// division-by-zero handler).
sub xh, xh, #0x60000000
pop {r4,r5,r6,r7,r8,lr} // ddiv_retinf expects no regs on the stack
b LOCAL_LABEL(ddiv_retinf)
LOCAL_LABEL(ddiv_correction):
// The slow path, entered if the approximate quotient was too close to a
// rounding boundary to trust, and also if there's a chance of underflow (so
// that we can reliably determine the rounding direction, including whether
// the quotient was exact).
//
// Regarding the input mantissas x,y and our approximate quotient q as
// integers in [2^52,2^53), the quotient is an approximation to either
// x*2^52/y or x*2^53/y, depending on which of x,y was larger. We know that q
// is less than the true value of that quotient by at most a small fraction
// of a ULP. So the correct rounded quotient is either equal to q or to q+1,
// and we can decide which by multiplying back up by y: we want q - x*2^k/y
// to be in the range (-1/2,+1/2) (where k = 52 or 53), which is equivalent
// to asking if qy - x*2^k is in the range (-y/2,+y/2).
//
// That's a calculation we can do in integers using only addition and
// multiplication. And we know that if q itself doesn't have that property
// then q+1 will.
// The mantissa of y is currently right at the top of the word, which means
// that if the result of our check is greater than it, it will overflow. So
// we must start by shifting y downward. We'll put it back at the bottom of
// the word, where it was in the input float.
lsr yl, yl, #11 // shift yl right
orr yl, yl, yh, lsl #21 // OR in the bits shifted out of yh
lsr yh, yh, #11 // shift yh right
// Compute the integer qy-x. Because q is already very close to the right
// quotient, we expect this to be an integer at most twice the size of y,
// which easily fits in 64 bits. So we don't need to compute the full 128-bit
// product: the low 64 bits are enough.
umull r8, r6, r7, yl // 64-bit product of the low words
mla r6, r7, yh, r6 // + (high word of y) * (low word of q)
mla r6, r5, yl, r6 // + (high word of q) * (low word of y)
// Now we must subtract either x << 53 or x << 52. This will only affect the
// high word of the product we've just computed. Also the mantissa of x is
// already shifted left by 11. So we shift xl left by either (52-32-11) or
// (53-32-11), i.e. by 9 or by 10, and subtract from the high word of the
// product.
//
// To decide which, we consult the value left in lr by the original test for
// renormalization, which added 0x40000000 to the high word of the initial
// approximate quotient 'quot'. If that had bit 62 set (so no renormalization
// needed) then the addition carried into the sign bit; otherwise it didn't.
// So lr is positive if and only if we need to shift xl left by an extra bit.
tst lr, lr // did we renormalize?
subpl r6, r6, xl, lsl #10 // if so, subtract x<<53 from q*y
submi r6, r6, xl, lsl #9 // if not, subtract x<<52
// Now r6:r8 contains the residual value r = qy - x*2^k as described above.
// If this is between -y/2 and +y/2 then q is already the correctly rounded
// quotient. Otherwise, the correct quotient is q+1, so the value in r6:r8
// will be too small (incrementing q would add y to it). So we need to check
// whether r < -y/2, or equivalently whether 2r < -y (avoiding having to
// worry about what happens when we halve y if it's odd).
//
// As mentioned above, division can't give an exact halfway case, so we don't
// need to worry about the case r = y/2.
adds r8, r8, r8 // multiply the residual by 2
adc r6, r6, r6
adds lr, r8, yl // add y to it, discarding the result
adcs lr, r6, yh
bpl LOCAL_LABEL(ddiv_corrected) // if the answer is positive, we're OK
// If we didn't take that branch, then the approximate quotient is too small
// by 1, so we must increment it. But also, we adjust the residual in r6:r8
// to match. That residual is unused by the main epilogue code, but we also
// came here for any underflowing value, and the underflow handler will need
// the exact residual to determine the rounding direction.
//
// (We could re-test whether underflow had happened and use that to skip the
// update of r6:r8, but the test would cost as much effort as it saved!)
adds r7, r7, #1 // increment the output quotient
adcs r5, r5, #0
adds r8, r8, yl // repeat the addition of y to the residual,
adcs r6, r6, yh // this time keeping the result in r6:r8
b LOCAL_LABEL(ddiv_corrected) // finally we can rejoin the main code
LOCAL_LABEL(ddiv_result_is_power_of_2):
// The special-case handler for the two input mantissas being equal, so that
// the result is an exact power of two. We set up all the output registers to
// the way the main code would have done it, and jump straight to
// ddiv_corrected. This includes setting r6:r8 to the 'residual' value
// computed by the slow path, in case this power-of-2 output is also an
// underflow, which will depend on those registers.
mov r5, #0x00100000 // high word of quotient mantissa = 1<<20
mov r7, #0 // low word of quotient mantissa = 0
mov r6, #0 // high word of residual = 0
mov r8, #0 // low word of residual = 0
b LOCAL_LABEL(ddiv_corrected)
LOCAL_LABEL(ddiv_underflow):
// We come here to handle underflow. The output double, constructed naïvely
// from the out-of-range exponent, is in xh:xl. We expect in this situation
// that we've _always_ come via either the ddiv_correction slow path or the
// ddiv_result_is_power_of_2 special case, both of which will have set up a
// residual value in r6:r8 equal to q*y - x*2^k (for appropriate k). This
// value is positive if the quotient is slightly above the true value (i.e.
// was rounded up), or negative if the quotient was rounded down. But we must
// also distinguish the third case of the residual being exactly zero.
add xh, xh, #0x60000000 // apply IEEE 754 exponent bias for __dunder
orrs r12, r6, r8 // set r12=0 and Z=1 if quotient was exact
movne r12, #1 // otherwise, set r12 = +1
orrne r12, r12, r6, asr #31 // and change to -1 if residual is negative
pop {r4,r5,r6,r7,r8,lr} // pop all locally saved registers
b SYMBOL_NAME(__compiler_rt_dunder) // and tailcall __dunder to finish
LOCAL_LABEL(ddiv_zerodenorm):
// We come here if either input had exponent 0, so there's at least one zero
// or denormal. However, we know there are no infinities or NaNs, because
// those were checked first and will have gone to ddiv_naninf below.
//
// First we must repeat the instruction which extracted the exponent of y
// into r5, this time unconditionally, in case the setup code didn't do it.
and r5, r12, yh, lsr #4
// If either or both input is actually zero, the answer is easy.
orrs lr, xl, xh, lsl #1 // is x zero?
beq LOCAL_LABEL(ddiv_xzero)
orrs lr, yl, yh, lsl #1 // is y zero?
beq LOCAL_LABEL(ddiv_divbyzero)
// Otherwise, delegate to __dnorm2 to handle denormals, converting them into
// a normalised mantissa and an out-of-range exponent. __dnorm2 expects the
// exponents at the bottom of their words instead of half way up, so shift
// down first.
lsr r4, r4, #16
lsr r5, r5, #16
push {r0, r1, r2, r3, r4, r5} // create a 'struct dnorm2' on the stack
mov r0, sp // pass it by address
bl SYMBOL_NAME(__compiler_rt_dnorm2)
pop {r0, r1, r2, r3, r4, r5}
// Rejoin the main code, with the exponent difference in the top half of r4,
// and the output sign in the low bit of r4. (The original setup code did the
// latter, but we clobbered it while setting up for __dnorm2.)
subs r4, r4, r5 // exponent difference, at the bottom of r4
lsls r4, r4, #16 // move it up to the right place
orr r4, r4, r6, lsr #31 // recover output sign from top bit of r6
b LOCAL_LABEL(ddiv_normalised) // rejoin the main code
LOCAL_LABEL(ddiv_xzero):
// We come here if x=0. We return 0 (of the right sign) if y is not 0, and
// the default quiet NaN if both inputs are zero.
orrs lr, yl, yh, lsl #1 // is y zero?
beq LOCAL_LABEL(ddiv_ivo_pop) // if so, pop registers and return a NaN
// We know xl=0 already, so we only need to reset xh to contain the right
// output sign. The setup code left that in the high bit of r6.
and xh, r6, #0x80000000
pop {r4,r5,r6,r7,r8,pc}
LOCAL_LABEL(ddiv_divbyzero):
// We come here if y=0, but x is not 0 (or we'd have gone to ddiv_xzero above
// instead). So we're dividing a nonzero number by zero, and must return
// infinity.
pop {r4,r5,r6,r7,r8,lr}
eor xh, xh, yh // combine signs to get result sign
b LOCAL_LABEL(ddiv_retinf)
LOCAL_LABEL(ddiv_naninf):
// We come here knowing that at least one operand is either NaN or infinity.
// If there's a NaN, we can tailcall __dnan2 to do the right thing. Pop our
// stacked registers first: we won't need that much spare space any more, and
// it makes the tailcall easier if we've already done it.
pop {r4,r5,r6,r7,r8,lr}
// A number is a NaN if its exponent is 0x7ff and at least one bit below that
// is set. The CMP + ADC pair here converts the two words xh:xl into a single
// word containing xh shifted up by one (throwing away the sign bit which
// makes no difference), with its low bit set if xl was nonzero. So if that
// is strictly greater than 0xffe00000, then x was a NaN.
cmp xl, #1
adc r12, xh, xh
cmp r12, #0xFFE00000
bhi SYMBOL_NAME(__compiler_rt_dnan2)
// Now check y in the same way.
cmp yl, #1
adc r12, yh, yh
cmp r12, #0xFFE00000
bhi SYMBOL_NAME(__compiler_rt_dnan2)
// Now we know there are no NaNs. Therefore there's at least one infinity. If
// both operands are infinity then we have inf / inf = invalid operation and
// must return a NaN. We detect this by XORing the inputs' exponent fields:
// knowing one of them is 7FF, they XOR to zero iff the other one is too.
eors r12, xh, yh // XOR entire top words of the inputs
lsl r12, r12, #1 // shift left to discard the sign bit
lsrs r12, r12, #21 // shift right again to discard mantissas
beq LOCAL_LABEL(ddiv_ivo) // if what's left is 0, we have inf / inf
// Otherwise, there's exactly one infinity, so our answers are easy, but
// depend on which operand it is:
// infinity / anything = infinity
// anything / infinity = 0
//
// Determine if x is the infinity, by bitwise inverting the whole word and
// then shifting left and right to isolate its exponent bits.
mvn r12, xh, lsl #1 // invert x, shift left to discard sign
lsrs r12, r12, #21 // and shift right to discard mantissa
eor xh, xh, yh // calculate the output sign bit
beq LOCAL_LABEL(ddiv_retinf) // if x = inf, return infinity of that sign
mov xl, #0 // otherwise clear all bits of x
and xh, xh, #0x80000000 // other than the sign bit
bx lr // and return zero of the same sign
LOCAL_LABEL(ddiv_retinf):
// Construct and return an infinity in xh:xl, with whatever sign bit is
// already in the top bit of xh.
mov xl, #0 // clear low word
mvn xh, xh, lsr #31 // shift xh[31] down to bit 0, inverted
mvn xh, xh, lsl #11 // uninvert, and put exponent 0x7ff below it
lsl xh, xh, #20 // shift back up to the top
bx lr
// Code to construct and return the default quiet NaN, for the cases inf/inf
// and 0/0. We provide two entry labels, one for callers who still need to
// pop all the registers this function pushed, and one for callers who have
// done that already.
LOCAL_LABEL(ddiv_ivo_pop):
pop {r4,r5,r6,r7,r8,lr}
LOCAL_LABEL(ddiv_ivo):
movw xh, 0x7ff8
lsls xh, xh, #16
mov xl, #0
bx lr
END_COMPILERRT_FUNCTION(__aeabi_ddiv)
// Table of approximate reciprocals.
CONST_SECTION
LOCAL_LABEL(reciptbl):
.byte 0xFF,0xFD,0xFB,0xF9,0xF7,0xF5,0xF4,0xF2
.byte 0xF0,0xEE,0xED,0xEB,0xE9,0xE8,0xE6,0xE4
.byte 0xE3,0xE1,0xE0,0xDE,0xDD,0xDB,0xDA,0xD8
.byte 0xD7,0xD5,0xD4,0xD3,0xD1,0xD0,0xCF,0xCD
.byte 0xCC,0xCB,0xCA,0xC8,0xC7,0xC6,0xC5,0xC4
.byte 0xC2,0xC1,0xC0,0xBF,0xBE,0xBD,0xBC,0xBB
.byte 0xBA,0xB9,0xB8,0xB7,0xB6,0xB5,0xB4,0xB3
.byte 0xB2,0xB1,0xB0,0xAF,0xAE,0xAD,0xAC,0xAB
.byte 0xAA,0xA9,0xA8,0xA8,0xA7,0xA6,0xA5,0xA4
.byte 0xA3,0xA3,0xA2,0xA1,0xA0,0x9F,0x9F,0x9E
.byte 0x9D,0x9C,0x9C,0x9B,0x9A,0x99,0x99,0x98
.byte 0x97,0x97,0x96,0x95,0x95,0x94,0x93,0x93
.byte 0x92,0x91,0x91,0x90,0x8F,0x8F,0x8E,0x8E
.byte 0x8D,0x8C,0x8C,0x8B,0x8B,0x8A,0x89,0x89
.byte 0x88,0x88,0x87,0x87,0x86,0x85,0x85,0x84
.byte 0x84,0x83,0x83,0x82,0x82,0x81,0x81,0x80
NO_EXEC_STACK_DIRECTIVE