lib/builtins/arm/divdf3.S - llvm-project/compiler-rt - Git at Google

 //===-- divdf3.S - double-precision floating point division ---------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // This file implements the __divdf3 function (double precision floating point
 // division), with the IEEE-754 default rounding (to nearest, ties to even),
 // for the Arm and Thumb2 ISAs.
 //
 //===----------------------------------------------------------------------===//

 #include "../assembly.h"
 #include "crt_endian.h"

 // The basic strategy of this division code is to use Newton-Raphson iteration
 // to calculate an approximation to 1/y, then multiply it by x. This procedure
 // delivers a quotient with 10 extra bits of precision, but which isn't exact.
 // We know an upper bound on its possible error, which gives an interval of
 // possible values for the true quotient. So we can check the 10 extra bits to
 // see whether a rounding boundary lies within the interval. If not, then we
 // can round and return without worrying further; otherwise, we go to slower
 // correction code that multiplies the approximate quotient back up by y and
 // checks it against x.
 //
 // This strategy depends critically on the upper bound on the approximation
 // error. Underestimating the error introduces a bug; overestimating it costs
 // performance, by sending more cases than necessary to the slow path.
 //
 // To give high confidence of its correctness, the upper bound has been proved
 // formally by Gappa. The Gappa proof and auxiliary code are not included in
 // this version, but they can be found in the Arm Optimized Routines repository
 //
 // https://github.com/ARM-software/optimized-routines/blob/bf3e44c3784dd3e18d3d5232e13b4d81f232310b/fp/at32/ddiv.S
 // https://github.com/ARM-software/optimized-routines/blob/bf3e44c3784dd3e18d3d5232e13b4d81f232310b/fp/auxiliary/ddiv-prove.py
 // https://github.com/ARM-software/optimized-routines/blob/bf3e44c3784dd3e18d3d5232e13b4d81f232310b/fp/auxiliary/ddiv-diagnostics.c
 //
 // and a pair of blog posts describing the concepts and procedure are here:
 //
 // https://developer.arm.com/community/arm-community-blogs/b/embedded-and-microcontrollers-blog/posts/formally-verifying-a-floating-point-division-routine-with-gappa-p1
 // https://developer.arm.com/community/arm-community-blogs/b/embedded-and-microcontrollers-blog/posts/formally-verifying-a-floating-point-division-routine-with-gappa-p2

   .syntax unified
   .text
   .p2align 2

 #if __ARM_PCS_VFP
 DEFINE_COMPILERRT_FUNCTION(__divdf3)
   push {r4, lr}
   VMOV_FROM_DOUBLE(r0, r1, d0)
   VMOV_FROM_DOUBLE(r2, r3, d1)
   bl __aeabi_ddiv
   VMOV_TO_DOUBLE(d0, r0, r1)
   pop {r4, pc}
 #else
 DEFINE_COMPILERRT_FUNCTION_ALIAS(__divdf3, __aeabi_ddiv)
 #endif

 DEFINE_COMPILERRT_FUNCTION(__aeabi_ddiv)

   push    {r4,r5,r6,r7,r8,lr}

   // Check if either input exponent 7FF (infinity or NaN), and if so, branch
   // out of line.
   ldr     r12, =0x07FF0000        // mask for exponent cold storage
   bics    r4, r12, xh, lsr #4     // test for Infs or NaNs
   bicsne  r4, r12, yh, lsr #4
   beq     LOCAL_LABEL(ddiv_naninf)

   // Extract the exponents of the input values x and y into bits 16..26 of r14
   // and r5 respectively, and in the process, check if either exponent is zero
   // (so that one or both inputs are 0 or denormal). In order to combine the
   // two tests, the second ANDS is performed conditionally, so that if x's
   // exponent is zero then the out-of-line code at ddiv_zerodenorm might find
   // y's exponent hasn't been set up yet.
   //
   // We also calculate the sign of the result, which will be needed whether or
   // not we branch. This is saved in the low bit of r4.
   ands    r4, r12, xh, lsr #4     // get exponent of x, setting Z if it's 0
   andsne  r5, r12, yh, lsr #4     // if not, extract and test exponent of y
   eor     r6, xh, yh              // XOR the input signs to get the result sign
   orr     r4, r4, r6, lsr #31     // save it in the low bit of r4
   beq     LOCAL_LABEL(ddiv_zerodenorm)         // branch out of line for zeroes or denormals

   // Calculate the initial exponent of the result, by subtracting the two input
   // exponents and adjusting for the IEEE exponent bias. This value may have to
   // be adjusted by 1 later, depending on the quotient of the mantissas.
   //
   // If we branched to ddiv_zerodenorm above, and it found denormals but no
   // zeroes, it may branch back here after renormalising them. We expect the
   // out-of-line code to have left the exponent difference in the top half of
   // r4 (still with the output sign in the low bit), but not yet to have
   // applied the bias. So it branches back in immediately after the SUB.
   //
   // The exponent bias we want is either 0x3fe or 0x3ff, depending on whether
   // we have to shift the output mantissa by 1 below. Neither of those values
   // fits in the immediate field of an ADD instruction, so we must use two
   // instructions.
   sub     r4, r4, r5
 LOCAL_LABEL(ddiv_normalised): // denormal handler will come back to here
   add     r4, r4, #0x03FC0000     // add the 8 high bits of the bias 0x3FE
   add     r4, r4, #0x00020000     // add the remaining bit of the bias

   // Shift both mantissas up to the top of their 64-bit register pair, and OR
   // in the leading 1 bit, which will occupy the high bit of the high word in
   // each case.
   mov     r5, #(1 << 31)          // high bit for ORing in to both mantissas
   orr     xh, r5, xh, lsl #11     // shift up xh and OR in the high bit
   orr     yh, r5, yh, lsl #11     // same for yh
   orr     xh, xh, xl, lsr #21     // OR in the bits shifted out of xl into xh
   orr     yh, yh, yl, lsr #21     // same for yl and yh
   lsl     xl, xl, #11             // shift up the rest of xl
   lsl     yl, yl, #11             // same for yl

   // Check if the two mantissas are exactly equal, so that the quotient is
   // exactly a power of 2. If so, branch out of line to handle that case
   // specially.
   //
   // This guarantees that when we examine the approximate quotient afterwards,
   // we can't be confused about whether it needs to be renormalised, which
   // would otherwise cost just as much effort as this check. Our reciprocal
   // approximation is always an underestimate (that's in the nature of this
   // particular Newton-Raphson iteration), so if x < y (meaning the mantissas
   // rather than the whole floats) then even the true quotient will be less
   // than 1, and the approximation even more so. On the other hand, if x > y,
   // then the true quotient will be enough greater than 1 that even the largest
   // possible error in the approximation can't make it look like less than 1.
   //
   // (Proof: regard x,y as normalised to the range [1,2). If x > y, then we
   // have x ≥ y+ε, where ε is the machine epsilon. So x/y ≥ 1+ε/y > 1+ε/2. And
   // the bound on the approximation error, given below, is far less than ε/2.)
   cmp     xh, yh
   cmpeq   xl, yl
   beq     LOCAL_LABEL(ddiv_result_is_power_of_2)

   // Now we begin the actual calculation of the reciprocal approximation.
   //
   // We begin with our two input mantissas stored in xh:xl and yh:yl, each with
   // its leading 1 explicit and shifted up to the top of the word. So they can
   // be regarded as 64-bit integers with the high bit set and the bottom 11
   // bits clear.

   // Obtain an 8-bit reciprocal approximation by using the topmost 8 bits of y
   // as a lookup table. The top bit of y is always set, so there are only 128
   // lookup table entries, not 256. The 8-bit value we load also has its top
   // bit set.
   lsr     r5, yh, #24           // r5 is the table index plus 0x80

   // Get the address of reciptbl, in various ways depending on position-
   // independence and Arm/Thumb state.
   //
   // Since the table index calculated above in r5 includes the high mantissa
   // bit, an index of 0x80 refers to the first table entry and 0xFF the last.
   // So we subtract 0x80 from the table address to compensate.
 #if defined __pic__ || defined __PIC__ || defined __ARM_ROPI
   // In PIC or ROPI modes, we must construct the address in a pc-relative
   // manner, by making a literal containing the offset from the current code.
   // The reference point for that offset is the value of pc as read by the add
   // instruction at get_reciptbl below, which will be 4 or 8 bytes after it in
   // Thumb or Arm state respectively.
 #if __thumb__
   ldr     r6, =(LOCAL_LABEL(reciptbl)-0x80) - (LOCAL_LABEL(get_reciptbl)+4)
 #else
   ldr     r6, =(LOCAL_LABEL(reciptbl)-0x80) - (LOCAL_LABEL(get_reciptbl)+8)
 #endif
 LOCAL_LABEL(get_reciptbl):
   add     r6, r6, pc
 #else
   // If we're not building for position independence, we can just load the
   // target address directly.
   ldr     r6, =(LOCAL_LABEL(reciptbl)-0x80)
 #endif

   ldrb    r6, [r6, r5]          // and load the approximation into r6

   // First Newton-Raphson iteration, which expands that 8-bit approximation to
   // a 17-bit one, again with its top bit set. We use the top 16 bits of y for
   // this, so that we can fit the multiplications into ordinary MUL rather than
   // UMULL.
   //
   // The Newton-Raphson formula to turn an approximation r ≈ 1/y into a better
   // one is r → r(2-yr). In this case we're scaling up to integers (informal
   // fixed point), so the 2 becomes 2^24.
   lsr     r5, yh, #16           // get top halfword of y
   mul     r7, r6, r5            // multiply it by the input value r
   rsb     r7, r7, #(1 << 24)    // subtract from 2 (scaled up appropriately)
   mul     r7, r6, r7            // multiply again to make r(2-yr)
   lsr     r7, r7, #14           // shift down to keep only 17 bits of it

   // Second iteration, expanding into a 32-bit reciprocal, using the top 31
   // bits of y (i.e. yh shifted by 1). The first multiplication (making yr) is
   // 32x32 → 64 bits, so we use a single UMULL; the second one making r(2-yr)
   // is 32x64, which we do with a UMULL by the bottom half of yr and then MLA
   // by the top half, so we only keep the low 64 bits of the full answer.
   //
   // The subtraction from 2 (again scaled up, this time to 2^48) is done by
   // RSBS+RSC, interleaved with the multiplications so as to use a delay slot
   // on CPUs that have one.
   lsr     r12, yh, #1
   umull   r6, r8, r7, r12       // r8:r6 = yr
   rsbs    r6, r6, #0            // low half of subtraction from 2
   umull   r12, lr, r7, r6       // multiply r by the low half of 2-yr
 #if __thumb__
   // Thumb has no RSC, so simulate it by bitwise inversion and then ADC
   mvn     r8, r8
   adc     r8, r8, #(1 << 16)
 #else
   rsc     r8, r8, #(1 << 16)    // high half of subtraction from 2
 #endif
   mla     r6, r7, r8, lr        // multiply r by the high half of 2-yr

   // Third iteration, expanding into a 64-bit reciprocal, with the leading bit
   // expected to end up in bit 60. Now the first multiplication to make yr is
   // 32x64 → 96 bits, so we put the product in three registers lr:r12:r8.
   // However, we're going to discard the low word r8 completely, because it
   // makes negligible difference. So we'll treat the output yr as 64-bit.
   umull   r8, r12, r6, yl       // multiply r by bottom half of y
   mov     lr, #0                // initialize high word to 0
   umlal   r12, lr, r6, yh       // multiply r by top half of y
   // Subtract from a power of 2, as usual. But in this case the power of 2
   // we're subtracting from is 2^64, which is just off the top of the 64-bit
   // value in lr:r12. So in fact we're just negating the whole thing!
   //
   // To preserve the invariant that the approximation error is always negative,
   // we negate via one's complement rather than two's. (This would only make a
   // difference if r8 had happened to be exactly 0. That in turn can occur when
   // yl=0, so one of the test cases in ddiv-diagnostics.c deliberately uses
   // such a value, so that the intermediate results can be checked against the
   // reference Python.)
   mvn     r12, r12
   mvn     lr, lr
   // Now lr:r12:r8 contains 2-yr. We discard the low word r8 to reduce that to
   // 64 bits, and do another 32x64 → 96 bit multiplication.
   umull   r5, r8, r6, r12      // multiply r by bottom half of 2-yr
   mov     r7, #0               // initialize high word to 0
   umlal   r8, r7, r6, lr       // multiply r by top half of 2-yr

   // That's the Newton-Raphson iteration done: we have a 64-bit approximation
   // to 1/y. Multiply it by x to get the full approximate quotient.
   //
   // In principle, this would be a 64x64 → 128 bit multiplication, involving
   // four long multiply instructions. But we only need the top 64 bits, and
   // we're already prepared to tolerate some error in the calculations, so we
   // cut corners: don't multiply the two low words together at all, and we
   // discard the bottom half of each of the (low * high) partial products
   // without bothering to propagate carries out of it.
   //
   // (All of these shortcuts are faithfully mimicked in the Python reference
   // implementation which generates Gappa input, so they're all accounted for
   // in the error analysis.)
 #if __ARM_FEATURE_DSP
   umull   r12, r6, xh, r8      // r6 = high word of x * low word of 1/y
   umull   r12, r5, xl, r7      // r5 = low word of x * high word of 1/y
   umaal   r6, r5, xh, r7       // add those to the product of both high words
 #else
   // Alternative instruction sequence using UMLAL, if UMAAL isn't available
   umull   r12, r6, xh, r8      // r6 = high word of x * low word of 1/y
   umull   r12, lr, xl, r7      // lr = low word of x * high word of 1/y
   adds    r6, r6, lr           // add those together
   mov     r5, #0               // set r5 to the carry out of that addition
   adc     r5, r5, #0
   umlal   r6, r5, xh, r7       // add that to the product of both high words
 #endif
   // Now r5:r6 is the completed approximate quotient, with its leading bit at
   // position either 61 or 62.

   // Normalize so that the leading bit is always in bit 60, by shifting left if
   // it isn't there already, and adjusting the output exponent by 1 to
   // compensate.
   //
   // We do the test in a slightly tricky way, by arranging to set the V flag if
   // the leading bit is in bit 60. This allows us to do the left shift under
   // the VC condition, which is convenient because the LSLS instruction that
   // shifts the low word left moves the top bit into the C flag without
   // affecting V.
   //
   // We also save the value written into lr by the initial ADDS instruction,
   // because that contains enough information to tell us whether we
   // renormalised here. The correction path for quotients too close to a
   // rounding boundary will need to recover that information.
   adds    lr, r5, #0x40000000  // set V flag if bit 62 of high word set
   subvc   r4, r4, #(1 << 16)   // if not, correct the exponent by 1,
   lslsvc  r6, r6, #1           // shift the low word of the quotient left
   adcvc   r5, r5, r5           // and shift its top bit into the high word

   // Now r5:r6 is the _normalised_ approximate quotient, with its leading bit
   // reliably in bit 60. This is the final output of the calculation that the
   // Gappa error-analysis proof applies to.

   // That 64-bit output has bit 63 clear; the leading 1 bit of the output
   // mantissa in bit 62, followed by 52 more mantissa bits; then 10 bits at the
   // bottom which are used for determining rounding.
   //
   // Compute the _approximately_ rounded-to-nearest output mantissa, by adding
   // half a ULP and shifting down. If we don't go to the slow path, this is the
   // correct output mantissa. (See fdiv.S for the proof that the round-to-even
   // tiebreaking case can't occur in floating-point division.)
   //
   // We keep the original version of r6, containing the ten rounding bits, so
   // that we can test it to see if we need the slow path.
   adds    r7, r6, #(1 << 9)    // add half a ULP, copying low word into r7
   adc     r5, r5, #0           // propagate carry into high word
   lsr     r7, r7, #10          // shift low word right
   orr     r7, r7, r5, lsl #22  // combine with bits shifted out of high word
   lsr     r5, r5, #10          // shift high word right

   // Now test r6 to see whether this output mantissa can be relied on, or
   // whether the approximation landed too close to a rounding boundary.
   //
   // The maximum possible error in the approximation, taking into account the
   // initial error in each lookup table entry, the remaining mathematical error
   // introduced by stopping after this many Newton-Raphson iterations, and
   // every shortcut, right shift, truncation and discarding of a partial
   // product in the algorithm above, is always negative, and less than 64 units
   // in the last place of the 64-bit approximate quotient. That is, the true
   // quotient lies somewhere between the 64-bit integer described as "final
   // output of the calculation" above, and that plus 64.
   //
   // So if the bottom 10 bits of r6 have the value 2^9 or greater, we're safe,
   // because the true value is _larger_ than the approximation, so if the
   // approximation is already above the rounding boundary then so is the true
   // value. And if those 10 bits are (2^9-64) or less then we're also safe,
   // because even if the true value is greater by 63, it's still on the same
   // side of the rounding boundary.
   //
   // We check the error by subtracting (2^9-64), so that the dangerous values
   // of the bottom 10 bits are those in the range 0,...,63, i.e. precisely
   // those with none of bits 6,7,8,9 set.
   //
   // We also combine this test with a check for underflow, because that also
   // needs more careful handling (the mantissa must be re-rounded to a
   // different bit position, which involves knowing whether it's exact).
   // Underflow has happened if the exponent in the top half of r4 is negative
   // (it's off by 1 so that the leading mantissa bit will increment it), so we
   // test by an ASR#31 (copying the top bit of r4 into all of it) and negating.
   // That way, the output value is zero on underflow, matching the flags from
   // the other check.
   sub     r6, r6, #(1 << 9)-64
   tst     r6, #0x3C0              // now EQ means we must go to the slow path
   mvnsne  r12, r4, asr #31        // also set EQ if underflow has happened
   beq     LOCAL_LABEL(ddiv_correction)         // branch out of line to do the hard bit

   // If we do go to ddiv_correction, it branches back here after the correction
   // code has finished. Either way, we expect that r5:r7 is the result
   // mantissa, with the top bit set, already in the correct position in the
   // word, and already rounded to nearest.
 LOCAL_LABEL(ddiv_corrected):
   // Recombine the output mantissa with the sign and exponent.
   add     xh, r5, r4, lsl #31     // add sign bit to top word of mantissa
   bic     r12, r4, #1             // isolate exponent in top half of r4
   add     xh, xh, r12, lsl #4     // add exponent to make the final high word
   mov     xl, r7                  // move low word into the right register

   // If there's no overflow or underflow, we're done.
   //
   // We _identified_ underflow above when we went to the slow path, but having
   // done that, the slow path came back here, so we must check for it again.
   // (The only purpose of the detour was to obtain accurate information about
   // whether the quotient is exact, or needed rounding.)
   //
   // The output exponent, offset downwards by 1, is in the top half of r4. If
   // it's negative, there's an underflow; if it's too large, there's an
   // overflow. We do an approximate test for both at once via an unsigned
   // comparison against 0x7f0, using r12 (the register in which we already
   // cleared the sign bit stored at the bottom). This identifies _most_ normal
   // outputs as quickly as possible.
   //
   // 0x7f0 isn't the maximum possible known-safe exponent, but it's the largest
   // one that fits in the immediate field of CMP. We deal with the remaining
   // cases in the next few instructions.
   cmp     r12, #(0x7f0 << 16)
   popls   {r4,r5,r6,r7,r8,pc}

   // Now check the remaining cases more carefully.
   //
   // If r12 < 0 then we definitely have underflow. We detect overflow precisely
   // by seeing if the _final_ output exponent (in the output register xh) is
   // 0x7ff or more, by incrementing it and seeing if the sign is opposite from
   // the intended output sign.
   add     lr, xh, #(1 << 20)      // increment the output exponent field
   teq     lr, r4, lsl #31         // set N if the sign now doesn't match r4[0]
   tstpl   r12, r12                // otherwise, set N if underflow
   poppl   {r4,r5,r6,r7,r8,pc}     // if neither, we've finished

   // If we still haven't returned, we really do have overflow or underflow, and
   // the sign of r12 tells us which.
   tst     r12, r12
   bmi     LOCAL_LABEL(ddiv_underflow)
   // For overflow, correct the sign by biasing the exponent downward, and go to
   // code that constructs an infinite return value (shared with the
   // division-by-zero handler).
   sub     xh, xh, #0x60000000
   pop     {r4,r5,r6,r7,r8,lr}     // ddiv_retinf expects no regs on the stack
   b       LOCAL_LABEL(ddiv_retinf)

 LOCAL_LABEL(ddiv_correction):
   // The slow path, entered if the approximate quotient was too close to a
   // rounding boundary to trust, and also if there's a chance of underflow (so
   // that we can reliably determine the rounding direction, including whether
   // the quotient was exact).
   //
   // Regarding the input mantissas x,y and our approximate quotient q as
   // integers in [2^52,2^53), the quotient is an approximation to either
   // x*2^52/y or x*2^53/y, depending on which of x,y was larger. We know that q
   // is less than the true value of that quotient by at most a small fraction
   // of a ULP. So the correct rounded quotient is either equal to q or to q+1,
   // and we can decide which by multiplying back up by y: we want q - x*2^k/y
   // to be in the range (-1/2,+1/2) (where k = 52 or 53), which is equivalent
   // to asking if qy - x*2^k is in the range (-y/2,+y/2).
   //
   // That's a calculation we can do in integers using only addition and
   // multiplication. And we know that if q itself doesn't have that property
   // then q+1 will.

   // The mantissa of y is currently right at the top of the word, which means
   // that if the result of our check is greater than it, it will overflow. So
   // we must start by shifting y downward. We'll put it back at the bottom of
   // the word, where it was in the input float.
   lsr     yl, yl, #11             // shift yl right
   orr     yl, yl, yh, lsl #21     // OR in the bits shifted out of yh
   lsr     yh, yh, #11             // shift yh right

   // Compute the integer qy-x. Because q is already very close to the right
   // quotient, we expect this to be an integer at most twice the size of y,
   // which easily fits in 64 bits. So we don't need to compute the full 128-bit
   // product: the low 64 bits are enough.
   umull   r8, r6, r7, yl          // 64-bit product of the low words
   mla     r6, r7, yh, r6          //   + (high word of y) * (low word of q)
   mla     r6, r5, yl, r6          //   + (high word of q) * (low word of y)

   // Now we must subtract either x << 53 or x << 52. This will only affect the
   // high word of the product we've just computed. Also the mantissa of x is
   // already shifted left by 11. So we shift xl left by either (52-32-11) or
   // (53-32-11), i.e. by 9 or by 10, and subtract from the high word of the
   // product.
   //
   // To decide which, we consult the value left in lr by the original test for
   // renormalization, which added 0x40000000 to the high word of the initial
   // approximate quotient 'quot'. If that had bit 62 set (so no renormalization
   // needed) then the addition carried into the sign bit; otherwise it didn't.
   // So lr is positive if and only if we need to shift xl left by an extra bit.
   tst     lr, lr                  // did we renormalize?
   subpl   r6, r6, xl, lsl #10     // if so, subtract x<<53 from q*y
   submi   r6, r6, xl, lsl #9      // if not, subtract x<<52

   // Now r6:r8 contains the residual value r = qy - x*2^k as described above.
   // If this is between -y/2 and +y/2 then q is already the correctly rounded
   // quotient. Otherwise, the correct quotient is q+1, so the value in r6:r8
   // will be too small (incrementing q would add y to it). So we need to check
   // whether r < -y/2, or equivalently whether 2r < -y (avoiding having to
   // worry about what happens when we halve y if it's odd).
   //
   // As mentioned above, division can't give an exact halfway case, so we don't
   // need to worry about the case r = y/2.
   adds    r8, r8, r8              // multiply the residual by 2
   adc     r6, r6, r6
   adds    lr, r8, yl              // add y to it, discarding the result
   adcs    lr, r6, yh
   bpl     LOCAL_LABEL(ddiv_corrected)          // if the answer is positive, we're OK

   // If we didn't take that branch, then the approximate quotient is too small
   // by 1, so we must increment it. But also, we adjust the residual in r6:r8
   // to match. That residual is unused by the main epilogue code, but we also
   // came here for any underflowing value, and the underflow handler will need
   // the exact residual to determine the rounding direction.
   //
   // (We could re-test whether underflow had happened and use that to skip the
   // update of r6:r8, but the test would cost as much effort as it saved!)
   adds    r7, r7, #1              // increment the output quotient
   adcs    r5, r5, #0
   adds    r8, r8, yl              // repeat the addition of y to the residual,
   adcs    r6, r6, yh              //   this time keeping the result in r6:r8
   b       LOCAL_LABEL(ddiv_corrected)          // finally we can rejoin the main code

 LOCAL_LABEL(ddiv_result_is_power_of_2):
   // The special-case handler for the two input mantissas being equal, so that
   // the result is an exact power of two. We set up all the output registers to
   // the way the main code would have done it, and jump straight to
   // ddiv_corrected. This includes setting r6:r8 to the 'residual' value
   // computed by the slow path, in case this power-of-2 output is also an
   // underflow, which will depend on those registers.
   mov     r5, #0x00100000         // high word of quotient mantissa = 1<<20
   mov     r7, #0                  // low word of quotient mantissa = 0
   mov     r6, #0                  // high word of residual = 0
   mov     r8, #0                  // low word of residual = 0
   b       LOCAL_LABEL(ddiv_corrected)

 LOCAL_LABEL(ddiv_underflow):
   // We come here to handle underflow. The output double, constructed naïvely
   // from the out-of-range exponent, is in xh:xl. We expect in this situation
   // that we've _always_ come via either the ddiv_correction slow path or the
   // ddiv_result_is_power_of_2 special case, both of which will have set up a
   // residual value in r6:r8 equal to q*y - x*2^k (for appropriate k). This
   // value is positive if the quotient is slightly above the true value (i.e.
   // was rounded up), or negative if the quotient was rounded down. But we must
   // also distinguish the third case of the residual being exactly zero.
   add     xh, xh, #0x60000000     // apply IEEE 754 exponent bias for __dunder
   orrs    r12, r6, r8             // set r12=0 and Z=1 if quotient was exact
   movne   r12, #1                 // otherwise, set r12 = +1
   orrne   r12, r12, r6, asr #31   // and change to -1 if residual is negative
   pop     {r4,r5,r6,r7,r8,lr}     // pop all locally saved registers
   b       SYMBOL_NAME(__compiler_rt_dunder)                // and tailcall __dunder to finish

 LOCAL_LABEL(ddiv_zerodenorm):
   // We come here if either input had exponent 0, so there's at least one zero
   // or denormal. However, we know there are no infinities or NaNs, because
   // those were checked first and will have gone to ddiv_naninf below.
   //
   // First we must repeat the instruction which extracted the exponent of y
   // into r5, this time unconditionally, in case the setup code didn't do it.
   and     r5, r12, yh, lsr #4

   // If either or both input is actually zero, the answer is easy.
   orrs    lr, xl, xh, lsl #1    // is x zero?
   beq     LOCAL_LABEL(ddiv_xzero)
   orrs    lr, yl, yh, lsl #1    // is y zero?
   beq     LOCAL_LABEL(ddiv_divbyzero)

   // Otherwise, delegate to __dnorm2 to handle denormals, converting them into
   // a normalised mantissa and an out-of-range exponent. __dnorm2 expects the
   // exponents at the bottom of their words instead of half way up, so shift
   // down first.
   lsr     r4, r4, #16
   lsr     r5, r5, #16
   push    {r0, r1, r2, r3, r4, r5} // create a 'struct dnorm2' on the stack
   mov     r0, sp                   // pass it by address
   bl      SYMBOL_NAME(__compiler_rt_dnorm2)
   pop     {r0, r1, r2, r3, r4, r5}

   // Rejoin the main code, with the exponent difference in the top half of r4,
   // and the output sign in the low bit of r4. (The original setup code did the
   // latter, but we clobbered it while setting up for __dnorm2.)
   subs    r4, r4, r5               // exponent difference, at the bottom of r4
   lsls    r4, r4, #16              // move it up to the right place
   orr     r4, r4, r6, lsr #31      // recover output sign from top bit of r6
   b       LOCAL_LABEL(ddiv_normalised)          // rejoin the main code

 LOCAL_LABEL(ddiv_xzero):
   // We come here if x=0. We return 0 (of the right sign) if y is not 0, and
   // the default quiet NaN if both inputs are zero.
   orrs    lr, yl, yh, lsl #1       // is y zero?
   beq     LOCAL_LABEL(ddiv_ivo_pop)             // if so, pop registers and return a NaN
   // We know xl=0 already, so we only need to reset xh to contain the right
   // output sign. The setup code left that in the high bit of r6.
   and     xh, r6, #0x80000000
   pop     {r4,r5,r6,r7,r8,pc}

 LOCAL_LABEL(ddiv_divbyzero):
   // We come here if y=0, but x is not 0 (or we'd have gone to ddiv_xzero above
   // instead). So we're dividing a nonzero number by zero, and must return
   // infinity.
   pop     {r4,r5,r6,r7,r8,lr}
   eor     xh, xh, yh               // combine signs to get result sign
   b       LOCAL_LABEL(ddiv_retinf)

 LOCAL_LABEL(ddiv_naninf):
   // We come here knowing that at least one operand is either NaN or infinity.
   // If there's a NaN, we can tailcall __dnan2 to do the right thing. Pop our
   // stacked registers first: we won't need that much spare space any more, and
   // it makes the tailcall easier if we've already done it.
   pop     {r4,r5,r6,r7,r8,lr}

   // A number is a NaN if its exponent is 0x7ff and at least one bit below that
   // is set. The CMP + ADC pair here converts the two words xh:xl into a single
   // word containing xh shifted up by one (throwing away the sign bit which
   // makes no difference), with its low bit set if xl was nonzero. So if that
   // is strictly greater than 0xffe00000, then x was a NaN.
   cmp     xl, #1
   adc     r12, xh, xh
   cmp     r12, #0xFFE00000
   bhi     SYMBOL_NAME(__compiler_rt_dnan2)
   // Now check y in the same way.
   cmp     yl, #1
   adc     r12, yh, yh
   cmp     r12, #0xFFE00000
   bhi     SYMBOL_NAME(__compiler_rt_dnan2)

   // Now we know there are no NaNs. Therefore there's at least one infinity. If
   // both operands are infinity then we have inf / inf = invalid operation and
   // must return a NaN. We detect this by XORing the inputs' exponent fields:
   // knowing one of them is 7FF, they XOR to zero iff the other one is too.
   eors    r12, xh, yh              // XOR entire top words of the inputs
   lsl     r12, r12, #1             // shift left to discard the sign bit
   lsrs    r12, r12, #21            // shift right again to discard mantissas
   beq     LOCAL_LABEL(ddiv_ivo)                 // if what's left is 0, we have inf / inf

   // Otherwise, there's exactly one infinity, so our answers are easy, but
   // depend on which operand it is:
   //   infinity / anything = infinity
   //   anything / infinity = 0
   //
   // Determine if x is the infinity, by bitwise inverting the whole word and
   // then shifting left and right to isolate its exponent bits.
   mvn     r12, xh, lsl #1          // invert x, shift left to discard sign
   lsrs    r12, r12, #21            //   and shift right to discard mantissa
   eor     xh, xh, yh               // calculate the output sign bit
   beq     LOCAL_LABEL(ddiv_retinf)              // if x = inf, return infinity of that sign
   mov     xl, #0                   // otherwise clear all bits of x
   and     xh, xh, #0x80000000      //   other than the sign bit
   bx      lr                       //   and return zero of the same sign
 LOCAL_LABEL(ddiv_retinf):
   // Construct and return an infinity in xh:xl, with whatever sign bit is
   // already in the top bit of xh.
   mov     xl, #0                   // clear low word
   mvn     xh, xh, lsr #31          // shift xh[31] down to bit 0, inverted
   mvn     xh, xh, lsl #11          // uninvert, and put exponent 0x7ff below it
   lsl     xh, xh, #20              // shift back up to the top
   bx      lr

   // Code to construct and return the default quiet NaN, for the cases inf/inf
   // and 0/0. We provide two entry labels, one for callers who still need to
   // pop all the registers this function pushed, and one for callers who have
   // done that already.
 LOCAL_LABEL(ddiv_ivo_pop):
   pop     {r4,r5,r6,r7,r8,lr}
 LOCAL_LABEL(ddiv_ivo):
   movw    xh, 0x7ff8
   lsls    xh, xh, #16
   mov     xl, #0
   bx      lr

 END_COMPILERRT_FUNCTION(__aeabi_ddiv)

   // Table of approximate reciprocals.
 CONST_SECTION
 LOCAL_LABEL(reciptbl):
   .byte 0xFF,0xFD,0xFB,0xF9,0xF7,0xF5,0xF4,0xF2
   .byte 0xF0,0xEE,0xED,0xEB,0xE9,0xE8,0xE6,0xE4
   .byte 0xE3,0xE1,0xE0,0xDE,0xDD,0xDB,0xDA,0xD8
   .byte 0xD7,0xD5,0xD4,0xD3,0xD1,0xD0,0xCF,0xCD
   .byte 0xCC,0xCB,0xCA,0xC8,0xC7,0xC6,0xC5,0xC4
   .byte 0xC2,0xC1,0xC0,0xBF,0xBE,0xBD,0xBC,0xBB
   .byte 0xBA,0xB9,0xB8,0xB7,0xB6,0xB5,0xB4,0xB3
   .byte 0xB2,0xB1,0xB0,0xAF,0xAE,0xAD,0xAC,0xAB
   .byte 0xAA,0xA9,0xA8,0xA8,0xA7,0xA6,0xA5,0xA4
   .byte 0xA3,0xA3,0xA2,0xA1,0xA0,0x9F,0x9F,0x9E
   .byte 0x9D,0x9C,0x9C,0x9B,0x9A,0x99,0x99,0x98
   .byte 0x97,0x97,0x96,0x95,0x95,0x94,0x93,0x93
   .byte 0x92,0x91,0x91,0x90,0x8F,0x8F,0x8E,0x8E
   .byte 0x8D,0x8C,0x8C,0x8B,0x8B,0x8A,0x89,0x89
   .byte 0x88,0x88,0x87,0x87,0x86,0x85,0x85,0x84
   .byte 0x84,0x83,0x83,0x82,0x82,0x81,0x81,0x80

 NO_EXEC_STACK_DIRECTIVE