| //===----------------------Hexagon builtin routine ------------------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| |
| // Double-precision multiply: __hexagon_muldf3 takes its operands in A (r1:0) and B (r3:2) and returns the product in A. |
| #define A r1:0 |
| #define AH r1 |
| #define AL r0 |
| #define B r3:2 |
| #define BH r3 |
| #define BL r2 |
| |
| #define BTMP r5:4 |
| #define BTMPH r5 |
| #define BTMPL r4 |
| |
| #define PP_ODD r7:6 |
| #define PP_ODD_H r7 |
| #define PP_ODD_L r6 |
| |
| #define ONE r9:8 |
| #define S_ONE r8 |
| #define S_ZERO r9 |
| |
| #define PP_HH r11:10 |
| #define PP_HH_H r11 |
| #define PP_HH_L r10 |
| |
| #define ATMP r13:12 |
| #define ATMPH r13 |
| #define ATMPL r12 |
| |
| #define PP_LL r15:14 |
| #define PP_LL_H r15 |
| #define PP_LL_L r14 |
| |
| #define TMP r28 |
| |
| #define MANTBITS 52 |
| #define HI_MANTBITS 20 |
| #define EXPBITS 11 |
| #define BIAS 1024 |
| #define MANTISSA_TO_INT_BIAS 52 |
| |
| // FUDGE: constant that adjusts the normalization amount in the error (underflow) code. |
| // The partial product is right-shifted by FUDGE minus an exponent-derived amount to get to a denorm. |
| #define FUDGE 5 |
| |
| #define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG |
| #define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG |
| #define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG |
| #define END(TAG) .size TAG,.-TAG |
| |
| #define SR_ROUND_OFF 22 |
| .text |
| .global __hexagon_muldf3 |
| .type __hexagon_muldf3,@function |
| Q6_ALIAS(muldf3) |
| FAST_ALIAS(muldf3) |
| FAST2_ALIAS(muldf3) |
| .p2align 5 |
| __hexagon_muldf3: |
| { |
| p0 = dfclass(A,#2) |
| p0 = dfclass(B,#2) |
| ATMP = combine(##0x40000000,#0) |
| } |
| { |
| ATMP = insert(A,#MANTBITS,#EXPBITS-1) |
| BTMP = asl(B,#EXPBITS-1) |
| TMP = #-BIAS |
| ONE = #1 |
| } |
| { |
| PP_ODD = mpyu(BTMPL,ATMPH) |
| BTMP = insert(ONE,#2,#62) |
| } |
| // Carry analysis for the PP_ODD accumulations: the MSB of each H (high-word) register is zero, so the sums never carry out. |
| // H <= 2^31-1 and L <= 2^32-1, therefore H*L <= 2^63-2^32-2^31+1. |
| // Adding two such H*L products gives at most 2^64-3*2^32+2. |
| // That leaves room for three more values of up to 2^32-1 without a carry; only one (PP_LL >> 32) is needed. |
| { |
| PP_LL = mpyu(ATMPL,BTMPL) |
| PP_ODD += mpyu(ATMPL,BTMPH) |
| } |
| { |
| PP_ODD += lsr(PP_LL,#32) |
| PP_HH = mpyu(ATMPH,BTMPH) |
| BTMP = combine(##BIAS+BIAS-4,#0) |
| } |
| { |
| PP_HH += lsr(PP_ODD,#32) |
| if (!p0) jump .Lmul_abnormal |
| p1 = cmp.eq(PP_LL_L,#0) // low word of the 64 discarded LSBs is zero? |
| p1 = cmp.eq(PP_ODD_L,#0) // ... and the next word is zero too? (both compares write p1) |
| } |
| |
| // PP_HH (top 64 bits of the product) is at most about 0x3FFF_FFFF_FFFF_FFFF and at least about 0x1000_0000_0000_0000. |
| // Below: OR in a sticky LSB if the low bits were nonzero (!p1), negate if the signs differ (!p2), range-check EXP0, then convert_d2df. |
| |
| #undef PP_ODD |
| #undef PP_ODD_H |
| #undef PP_ODD_L |
| #define EXP10 r7:6 |
| #define EXP1 r7 |
| #define EXP0 r6 |
| { |
| if (!p1) PP_HH_L = or(PP_HH_L,S_ONE) |
| EXP0 = extractu(AH,#EXPBITS,#HI_MANTBITS) |
| EXP1 = extractu(BH,#EXPBITS,#HI_MANTBITS) |
| } |
| { |
| PP_LL = neg(PP_HH) |
| EXP0 += add(TMP,EXP1) |
| TMP = xor(AH,BH) |
| } |
| { |
| if (!p2.new) PP_HH = PP_LL |
| p2 = cmp.gt(TMP,#-1) |
| p0 = !cmp.gt(EXP0,BTMPH) |
| p0 = cmp.gt(EXP0,BTMPL) |
| if (!p0.new) jump:nt .Lmul_ovf_unf |
| } |
| { |
| A = convert_d2df(PP_HH) |
| EXP0 = add(EXP0,#-BIAS-58) |
| } |
| { |
| AH += asl(EXP0,#HI_MANTBITS) |
| jumpr r31 |
| } |
| |
| .falign |
| .Lpossible_unf: |
| // Reached from .Lmul_ovf_unf when the recomputed exponent (EXP1) is positive. |
| // The result may nevertheless have been rounded up to an exponent of 1, |
| // and if it was rounded up to that exponent, |
| // underflow must be raised as well. |
| // This is easy to detect: the result must be exactly +/- 0x0010_0000_0000_0000 (AL == 0 and AH clear under the BTMPL mask). |
| // The partial product should also have more than one bit set; the code tests that the low 15 bits of ATMPH are all set, then ORs 0x030 into USR. |
| // |
| // Note: ATMP should have abs(PP_HH) |
| // Note: BTMPL should have 0x7FEFFFFF |
| { |
| p0 = cmp.eq(AL,#0) |
| p0 = bitsclr(AH,BTMPL) |
| if (!p0.new) jumpr:t r31 |
| BTMPH = #0x7fff |
| } |
| { |
| p0 = bitsset(ATMPH,BTMPH) |
| BTMPL = USR |
| BTMPH = #0x030 |
| } |
| { |
| if (p0) BTMPL = or(BTMPL,BTMPH) |
| } |
| { |
| USR = BTMPL |
| } |
| { |
| p0 = dfcmp.eq(A,A) |
| jumpr r31 |
| } |
| .falign |
| .Lmul_ovf_unf: |
| { |
| A = convert_d2df(PP_HH) |
| ATMP = abs(PP_HH) // ATMP = |PP_HH|, used by the underflow paths |
| EXP1 = add(EXP0,#-BIAS-58) |
| } |
| { |
| AH += asl(EXP1,#HI_MANTBITS) |
| EXP1 = extractu(AH,#EXPBITS,#HI_MANTBITS) |
| BTMPL = ##0x7FEFFFFF |
| } |
| { |
| EXP1 += add(EXP0,##-BIAS-58) |
| // Disabled alternative: BTMPH = add(clb(ATMP),#-2) |
| BTMPH = #0 |
| } |
| { |
| p0 = cmp.gt(EXP1,##BIAS+BIAS-2) // exponent above BIAS+BIAS-2: overflow |
| if (p0.new) jump:nt .Lmul_ovf |
| } |
| { |
| p0 = cmp.gt(EXP1,#0) |
| if (p0.new) jump:nt .Lpossible_unf |
| BTMPH = sub(EXP0,BTMPH) |
| TMP = #63 // upper bound for the right-shift amount |
| } |
| // Underflow (fall-through when EXP1 is not greater than zero) |
| // |
| // PP_HH holds the (signed) partial product with a sticky LSB; ATMP holds its absolute value. |
| // In magnitude it is at most about 0x3FFF_FFFF_FFFF_FFFF |
| // and at least about 0x1000_0000_0000_0000. |
| // The exponent of PP_HH is in EXP1, which is non-positive (0 or negative) here. |
| // That is the exponent obtained after normalization. |
| // |
| // EXP0 holds the exponent that, when added to the normalized value, is out of range. |
| // |
| // Strategy: |
| // |
| // * Shift the bits down, keeping a sticky bit, so that they are aligned according |
| // to the LZ count and appropriate exponent, but not all the way to the mantissa |
| // field; the last few bits are kept around. |
| // * Put a 1 near the MSB (bit HI_MANTBITS+3 of the high word) |
| // * Check the LSBs for inexact; if inexact, also set underflow |
| // * Convert with convert_d2df, which rounds according to the rounding mode |
| // * Replace the exponent field with zero |
| |
| { |
| BTMPL = #0 // offset (0) for the register-form extractu below |
| BTMPH = sub(#FUDGE,BTMPH) // right-shift amount = FUDGE - BTMPH |
| } |
| { |
| p3 = cmp.gt(PP_HH_H,#-1) // p3 = PP_HH is non-negative (sign is reapplied after the shift) |
| BTMPH = min(BTMPH,TMP) // clamp the shift amount to TMP (63) |
| PP_HH = ATMP |
| } |
| { |
| TMP = USR |
| PP_LL = extractu(PP_HH,BTMP) |
| } |
| { |
| PP_HH = asr(PP_HH,BTMPH) |
| BTMPL = #0x0030 // flag bits for inexact+underflow, ORed into the USR copy below if needed |
| AH = insert(S_ZERO,#EXPBITS,#HI_MANTBITS) |
| } |
| { |
| p0 = cmp.gtu(ONE,PP_LL) // p0 = the extracted bits (PP_LL) are all zero |
| if (!p0.new) PP_HH_L = or(PP_HH_L,S_ONE) // otherwise fold them into a sticky LSB |
| PP_HH_H = setbit(PP_HH_H,#HI_MANTBITS+3) // add a bit back in so that the convert instruction can be used |
| } |
| { |
| PP_LL = neg(PP_HH) |
| p1 = bitsclr(PP_HH_L,#0x7) // p1 = low three bits of PP_HH_L are clear |
| if (!p1.new) TMP = or(BTMPL,TMP) // if not, set Inexact+Underflow in the USR copy |
| } |
| { |
| if (!p3) PP_HH = PP_LL |
| USR = TMP |
| } |
| { |
| A = convert_d2df(PP_HH) // the conversion does the rounding |
| p0 = dfcmp.eq(A,A) // FP compare to realize the exception |
| } |
| { |
| AH = insert(S_ZERO,#EXPBITS-1,#HI_MANTBITS+1) // insert correct exponent: zero the upper EXPBITS-1 bits of the exponent field |
| jumpr r31 |
| } |
| .falign |
| .Lmul_ovf: |
| // Overflow: the result is either the max finite value or infinity, chosen by rounding mode and sign. Either way, overflow+inexact are set in USR. |
| { |
| TMP = USR |
| ATMP = combine(##0x7fefffff,#-1) // positive max finite |
| A = PP_HH |
| } |
| { |
| PP_LL_L = extractu(TMP,#2,#SR_ROUND_OFF) // 2-bit rounding-mode field of USR |
| TMP = or(TMP,#0x28) // inexact + overflow |
| BTMP = combine(##0x7ff00000,#0) // positive infinity |
| } |
| { |
| USR = TMP |
| PP_LL_L ^= lsr(AH,#31) // XOR the sign bit of A into the mode: does sign match rounding? |
| TMP = PP_LL_L // TMP gets the unmodified rounding mode |
| } |
| { |
| p0 = !cmp.eq(TMP,#1) // If the mode is not 1 (round-to-zero) and |
| p0 = !cmp.eq(PP_LL_L,#2) // the sign-adjusted mode is not 2 (rounding the other way), |
| if (p0.new) ATMP = BTMP // the result is infinity instead of max finite |
| p0 = dfcmp.eq(A,A) // Realize FP exception if enabled |
| } |
| { |
| A = insert(ATMP,#63,#0) // insert inf/maxfinite into the low 63 bits, leave the sign bit of A |
| jumpr r31 |
| } |
| |
| .Lmul_abnormal: |
| { |
| ATMP = extractu(A,#63,#0) // ATMP = A with the sign stripped (low 63 bits) |
| BTMP = extractu(B,#63,#0) // BTMP = B with the sign stripped (low 63 bits) |
| } |
| { |
| p3 = cmp.gtu(ATMP,BTMP) |
| if (!p3.new) A = B // sort values: swap so that A has the larger magnitude |
| if (!p3.new) B = A // (second half of the swap) |
| } |
| { |
| // After sorting, a NaN operand (largest magnitude) is in A. Any NaN --> NaN, possibly raise invalid if sNaN |
| p0 = dfclass(A,#0x0f) // p0 = A is not a NaN |
| if (!p0.new) jump:nt .Linvalid_nan |
| if (!p3) ATMP = BTMP |
| if (!p3) BTMP = ATMP |
| } |
| { |
| // Infinity * nonzero number is infinity: .Ltrue_inf returns A with the sign bit of B XORed in |
| p1 = dfclass(A,#0x08) // A is infinity |
| p1 = dfclass(B,#0x0e) // B is nonzero |
| } |
| { |
| // Infinity * zero --> NaN, raise invalid (.Linvalid_zeroinf) |
| // Any other product with a zero B returns zero (p2, tested below) |
| p0 = dfclass(A,#0x08) // A is infinity |
| p0 = dfclass(B,#0x01) // B is zero |
| } |
| { |
| if (p1) jump .Ltrue_inf |
| p2 = dfclass(B,#0x01) |
| } |
| { |
| if (p0) jump .Linvalid_zeroinf |
| if (p2) jump .Ltrue_zero // B is zero, so return zero (.Ltrue_zero swaps A and B, then applies the sign) |
| TMP = ##0x7c000000 |
| } |
| // What remains is a normal or subnormal A times a subnormal B, with A the larger in magnitude. |
| // If A is very small too (nominally exp(a) < BIAS-MANTBITS; the code tests that AH has no bits set under 0x7c000000), |
| // the product reduces to a single sticky bit, which is easy to round (.Lmul_tiny). |
| // Otherwise shift B left by cl0(BTMP)-EXPBITS to normalize it, lower the exponent of A by the same amount, |
| // and re-enter __hexagon_muldf3 to try again. |
| { |
| p0 = bitsclr(AH,TMP) |
| if (p0.new) jump:nt .Lmul_tiny |
| } |
| { |
| TMP = cl0(BTMP) |
| } |
| { |
| TMP = add(TMP,#-EXPBITS) |
| } |
| { |
| BTMP = asl(BTMP,TMP) |
| } |
| { |
| B = insert(BTMP,#63,#0) |
| AH -= asl(TMP,#HI_MANTBITS) |
| } |
| jump __hexagon_muldf3 |
| .Lmul_tiny: |
| { |
| TMP = USR |
| A = xor(A,B) // bit 63 of A becomes the sign of the product |
| } |
| { |
| TMP = or(TMP,#0x30) // Inexact + Underflow |
| A = insert(ONE,#63,#0) // put in the rounded-up value: low 63 bits = 1, sign bit kept |
| BTMPH = extractu(TMP,#2,#SR_ROUND_OFF) // get the 2-bit rounding mode from USR |
| } |
| { |
| USR = TMP |
| p0 = cmp.gt(BTMPH,#1) // p0 = mode is 2 or 3 (round towards pos/neg inf) |
| if (!p0.new) AL = #0 // if not, the result is zero |
| BTMPH ^= lsr(AH,#31) // XOR the result sign into the mode: rounding my way --> LSB set |
| } |
| { |
| p0 = cmp.eq(BTMPH,#3) // p0 = sign-adjusted mode is 3 (rounding towards the matching infinity) |
| if (!p0.new) AL = #0 // if not, the result goes to zero; otherwise the rounded-up value is kept |
| jumpr r31 |
| } |
| .Linvalid_zeroinf: |
| { |
| TMP = USR |
| } |
| { |
| A = #-1 |
| TMP = or(TMP,#2) |
| } |
| { |
| USR = TMP |
| } |
| { |
| p0 = dfcmp.uo(A,A) // unordered compare on the all-ones value in A: force exception if enabled |
| jumpr r31 |
| } |
| .Linvalid_nan: |
| { |
| p0 = dfclass(B,#0x0f) // p0 = B is not a NaN |
| TMP = convert_df2sf(A) // conversion only for its side effect: generates invalid if A is an sNaN |
| if (p0.new) B = A // B not NaN: make it whatever A is before the second conversion |
| } |
| { |
| BL = convert_df2sf(B) // same check for B: generates invalid if sNaN |
| A = #-1 |
| jumpr r31 |
| } |
| .falign |
| .Ltrue_zero: |
| { |
| A = B |
| B = A |
| } |
| .Ltrue_inf: |
| { |
| BH = extract(BH,#1,#31) |
| } |
| { |
| AH ^= asl(BH,#31) |
| jumpr r31 |
| } |
| END(__hexagon_muldf3) |
| |
| #undef ATMP |
| #undef ATMPL |
| #undef ATMPH |
| #undef BTMP |
| #undef BTMPL |
| #undef BTMPH |