Sid Manning | d880b3b | 2018-05-09 14:44:54 +0000 | [diff] [blame] | 1 | //===----------------------Hexagon builtin routine ------------------------===// |
| 2 | // |
Chandler Carruth | 9d304d2 | 2019-01-19 10:56:40 +0000 | [diff] [blame] | 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
Sid Manning | d880b3b | 2018-05-09 14:44:54 +0000 | [diff] [blame] | 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
Petr Hosek | 5ba5011 | 2019-04-28 22:47:49 +0000 | [diff] [blame] | 9 | // Double Precision square root |
Sid Manning | d880b3b | 2018-05-09 14:44:54 +0000 | [diff] [blame] | 10 | |
| 11 | #define EXP r28 |
| 12 | |
| 13 | #define A r1:0 |
| 14 | #define AH r1 |
| 15 | #define AL r0 |
| 16 | |
| 17 | #define SFSH r3:2 |
| 18 | #define SF_S r3 |
| 19 | #define SF_H r2 |
| 20 | |
| 21 | #define SFHALF_SONE r5:4 |
| 22 | #define S_ONE r4 |
| 23 | #define SFHALF r5 |
| 24 | #define SF_D r6 |
| 25 | #define SF_E r7 |
| 26 | #define RECIPEST r8 |
| 27 | #define SFRAD r9 |
| 28 | |
| 29 | #define FRACRAD r11:10 |
| 30 | #define FRACRADH r11 |
| 31 | #define FRACRADL r10 |
| 32 | |
| 33 | #define ROOT r13:12 |
| 34 | #define ROOTHI r13 |
| 35 | #define ROOTLO r12 |
| 36 | |
| 37 | #define PROD r15:14 |
| 38 | #define PRODHI r15 |
| 39 | #define PRODLO r14 |
| 40 | |
| 41 | #define P_TMP p0 |
| 42 | #define P_EXP1 p1 |
| 43 | #define NORMAL p2 |
| 44 | |
| 45 | #define SF_EXPBITS 8 |
| 46 | #define SF_MANTBITS 23 |
| 47 | |
| 48 | #define DF_EXPBITS 11 |
| 49 | #define DF_MANTBITS 52 |
| 50 | |
| 51 | #define DF_BIAS 0x3ff |
| 52 | |
| 53 | #define DFCLASS_ZERO 0x01 |
| 54 | #define DFCLASS_NORMAL 0x02 |
| 55 | #define DFCLASS_DENORMAL 0x02 |
| 56 | #define DFCLASS_INFINITE 0x08 |
| 57 | #define DFCLASS_NAN 0x10 |
| 58 | |
| 59 | #define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG; .type __qdsp_##TAG,@function |
| 60 | #define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG; .type __hexagon_fast_##TAG,@function |
| 61 | #define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG; .type __hexagon_fast2_##TAG,@function |
| 62 | #define END(TAG) .size TAG,.-TAG |
| 63 | |
| 64 | .text |
| 65 | .global __hexagon_sqrtdf2 |
| 66 | .type __hexagon_sqrtdf2,@function |
| 67 | .global __hexagon_sqrt |
| 68 | .type __hexagon_sqrt,@function |
| 69 | Q6_ALIAS(sqrtdf2) |
| 70 | Q6_ALIAS(sqrt) |
| 71 | FAST_ALIAS(sqrtdf2) |
| 72 | FAST_ALIAS(sqrt) |
| 73 | FAST2_ALIAS(sqrtdf2) |
| 74 | FAST2_ALIAS(sqrt) |
| 75 | .type sqrt,@function |
| 76 | .p2align 5 |
| 77 | __hexagon_sqrtdf2: |
| 78 | __hexagon_sqrt: |
| 79 | { |
| 80 | PROD = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS) |
| 81 | EXP = extractu(AH,#DF_EXPBITS,#DF_MANTBITS-32) |
| 82 | SFHALF_SONE = combine(##0x3f000004,#1) |
| 83 | } |
| 84 | { |
| 85 | NORMAL = dfclass(A,#DFCLASS_NORMAL) // Is it normal |
| 86 | NORMAL = cmp.gt(AH,#-1) // and positive? |
| 87 | if (!NORMAL.new) jump:nt .Lsqrt_abnormal |
| 88 | SFRAD = or(SFHALF,PRODLO) |
| 89 | } |
| 90 | #undef NORMAL |
| 91 | .Ldenormal_restart: |
| 92 | { |
| 93 | FRACRAD = A |
| 94 | SF_E,P_TMP = sfinvsqrta(SFRAD) |
| 95 | SFHALF = and(SFHALF,#-16) |
| 96 | SFSH = #0 |
| 97 | } |
| 98 | #undef A |
| 99 | #undef AH |
| 100 | #undef AL |
| 101 | #define ERROR r1:0 |
| 102 | #define ERRORHI r1 |
| 103 | #define ERRORLO r0 |
| 104 | // SF_E : reciprocal square root |
| 105 | // SF_H : half rsqrt |
| 106 | // sf_S : square root |
| 107 | // SF_D : error term |
| 108 | // SFHALF: 0.5 |
| 109 | { |
| 110 | SF_S += sfmpy(SF_E,SFRAD):lib // s0: root |
| 111 | SF_H += sfmpy(SF_E,SFHALF):lib // h0: 0.5*y0. Could also decrement exponent... |
| 112 | SF_D = SFHALF |
| 113 | #undef SFRAD |
| 114 | #define SHIFTAMT r9 |
| 115 | SHIFTAMT = and(EXP,#1) |
| 116 | } |
| 117 | { |
| 118 | SF_D -= sfmpy(SF_S,SF_H):lib // d0: 0.5-H*S = 0.5-0.5*~1 |
| 119 | FRACRADH = insert(S_ONE,#DF_EXPBITS+1,#DF_MANTBITS-32) // replace upper bits with hidden |
| 120 | P_EXP1 = cmp.gtu(SHIFTAMT,#0) |
| 121 | } |
| 122 | { |
| 123 | SF_S += sfmpy(SF_S,SF_D):lib // s1: refine sqrt |
| 124 | SF_H += sfmpy(SF_H,SF_D):lib // h1: refine half-recip |
| 125 | SF_D = SFHALF |
| 126 | SHIFTAMT = mux(P_EXP1,#8,#9) |
| 127 | } |
| 128 | { |
| 129 | SF_D -= sfmpy(SF_S,SF_H):lib // d1: error term |
| 130 | FRACRAD = asl(FRACRAD,SHIFTAMT) // Move fracrad bits to right place |
| 131 | SHIFTAMT = mux(P_EXP1,#3,#2) |
| 132 | } |
| 133 | { |
| 134 | SF_H += sfmpy(SF_H,SF_D):lib // d2: rsqrt |
| 135 | // cool trick: half of 1/sqrt(x) has same mantissa as 1/sqrt(x). |
| 136 | PROD = asl(FRACRAD,SHIFTAMT) // fracrad<<(2+exp1) |
| 137 | } |
| 138 | { |
| 139 | SF_H = and(SF_H,##0x007fffff) |
| 140 | } |
| 141 | { |
| 142 | SF_H = add(SF_H,##0x00800000 - 3) |
| 143 | SHIFTAMT = mux(P_EXP1,#7,#8) |
| 144 | } |
| 145 | { |
| 146 | RECIPEST = asl(SF_H,SHIFTAMT) |
| 147 | SHIFTAMT = mux(P_EXP1,#15-(1+1),#15-(1+0)) |
| 148 | } |
| 149 | { |
| 150 | ROOT = mpyu(RECIPEST,PRODHI) // root = mpyu_full(recipest,hi(fracrad<<(2+exp1))) |
| 151 | } |
| 152 | |
| 153 | #undef SFSH // r3:2 |
| 154 | #undef SF_H // r2 |
| 155 | #undef SF_S // r3 |
| 156 | #undef S_ONE // r4 |
| 157 | #undef SFHALF // r5 |
| 158 | #undef SFHALF_SONE // r5:4 |
| 159 | #undef SF_D // r6 |
| 160 | #undef SF_E // r7 |
| 161 | |
| 162 | #define HL r3:2 |
| 163 | #define LL r5:4 |
| 164 | #define HH r7:6 |
| 165 | |
| 166 | #undef P_EXP1 |
| 167 | #define P_CARRY0 p1 |
| 168 | #define P_CARRY1 p2 |
| 169 | #define P_CARRY2 p3 |
| 170 | |
Petr Hosek | 5ba5011 | 2019-04-28 22:47:49 +0000 | [diff] [blame] | 171 | // Iteration 0 |
| 172 | // Maybe we can save a cycle by starting with ERROR=asl(fracrad), then as we multiply |
| 173 | // We can shift and subtract instead of shift and add? |
Sid Manning | d880b3b | 2018-05-09 14:44:54 +0000 | [diff] [blame] | 174 | { |
| 175 | ERROR = asl(FRACRAD,#15) |
| 176 | PROD = mpyu(ROOTHI,ROOTHI) |
| 177 | P_CARRY0 = cmp.eq(r0,r0) |
| 178 | } |
| 179 | { |
| 180 | ERROR -= asl(PROD,#15) |
| 181 | PROD = mpyu(ROOTHI,ROOTLO) |
| 182 | P_CARRY1 = cmp.eq(r0,r0) |
| 183 | } |
| 184 | { |
| 185 | ERROR -= lsr(PROD,#16) |
| 186 | P_CARRY2 = cmp.eq(r0,r0) |
| 187 | } |
| 188 | { |
| 189 | ERROR = mpyu(ERRORHI,RECIPEST) |
| 190 | } |
| 191 | { |
| 192 | ROOT += lsr(ERROR,SHIFTAMT) |
| 193 | SHIFTAMT = add(SHIFTAMT,#16) |
| 194 | ERROR = asl(FRACRAD,#31) // for next iter |
| 195 | } |
Petr Hosek | 5ba5011 | 2019-04-28 22:47:49 +0000 | [diff] [blame] | 196 | // Iteration 1 |
Sid Manning | d880b3b | 2018-05-09 14:44:54 +0000 | [diff] [blame] | 197 | { |
| 198 | PROD = mpyu(ROOTHI,ROOTHI) |
| 199 | ERROR -= mpyu(ROOTHI,ROOTLO) // amount is 31, no shift needed |
| 200 | } |
| 201 | { |
| 202 | ERROR -= asl(PROD,#31) |
| 203 | PROD = mpyu(ROOTLO,ROOTLO) |
| 204 | } |
| 205 | { |
| 206 | ERROR -= lsr(PROD,#33) |
| 207 | } |
| 208 | { |
| 209 | ERROR = mpyu(ERRORHI,RECIPEST) |
| 210 | } |
| 211 | { |
| 212 | ROOT += lsr(ERROR,SHIFTAMT) |
| 213 | SHIFTAMT = add(SHIFTAMT,#16) |
| 214 | ERROR = asl(FRACRAD,#47) // for next iter |
| 215 | } |
Petr Hosek | 5ba5011 | 2019-04-28 22:47:49 +0000 | [diff] [blame] | 216 | // Iteration 2 |
Sid Manning | d880b3b | 2018-05-09 14:44:54 +0000 | [diff] [blame] | 217 | { |
| 218 | PROD = mpyu(ROOTHI,ROOTHI) |
| 219 | } |
| 220 | { |
| 221 | ERROR -= asl(PROD,#47) |
| 222 | PROD = mpyu(ROOTHI,ROOTLO) |
| 223 | } |
| 224 | { |
| 225 | ERROR -= asl(PROD,#16) // bidir shr 31-47 |
| 226 | PROD = mpyu(ROOTLO,ROOTLO) |
| 227 | } |
| 228 | { |
| 229 | ERROR -= lsr(PROD,#17) // 64-47 |
| 230 | } |
| 231 | { |
| 232 | ERROR = mpyu(ERRORHI,RECIPEST) |
| 233 | } |
| 234 | { |
| 235 | ROOT += lsr(ERROR,SHIFTAMT) |
| 236 | } |
| 237 | #undef ERROR |
| 238 | #undef PROD |
| 239 | #undef PRODHI |
| 240 | #undef PRODLO |
| 241 | #define REM_HI r15:14 |
| 242 | #define REM_HI_HI r15 |
| 243 | #define REM_LO r1:0 |
| 244 | #undef RECIPEST |
| 245 | #undef SHIFTAMT |
| 246 | #define TWOROOT_LO r9:8 |
Petr Hosek | 5ba5011 | 2019-04-28 22:47:49 +0000 | [diff] [blame] | 247 | // Adjust Root |
Sid Manning | d880b3b | 2018-05-09 14:44:54 +0000 | [diff] [blame] | 248 | { |
| 249 | HL = mpyu(ROOTHI,ROOTLO) |
| 250 | LL = mpyu(ROOTLO,ROOTLO) |
| 251 | REM_HI = #0 |
| 252 | REM_LO = #0 |
| 253 | } |
| 254 | { |
| 255 | HL += lsr(LL,#33) |
| 256 | LL += asl(HL,#33) |
| 257 | P_CARRY0 = cmp.eq(r0,r0) |
| 258 | } |
| 259 | { |
| 260 | HH = mpyu(ROOTHI,ROOTHI) |
| 261 | REM_LO = sub(REM_LO,LL,P_CARRY0):carry |
| 262 | TWOROOT_LO = #1 |
| 263 | } |
| 264 | { |
| 265 | HH += lsr(HL,#31) |
| 266 | TWOROOT_LO += asl(ROOT,#1) |
| 267 | } |
| 268 | #undef HL |
| 269 | #undef LL |
| 270 | #define REM_HI_TMP r3:2 |
| 271 | #define REM_HI_TMP_HI r3 |
| 272 | #define REM_LO_TMP r5:4 |
| 273 | { |
| 274 | REM_HI = sub(FRACRAD,HH,P_CARRY0):carry |
| 275 | REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY1):carry |
| 276 | #undef FRACRAD |
| 277 | #undef HH |
| 278 | #define ZERO r11:10 |
| 279 | #define ONE r7:6 |
| 280 | ONE = #1 |
| 281 | ZERO = #0 |
| 282 | } |
| 283 | { |
| 284 | REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY1):carry |
| 285 | ONE = add(ROOT,ONE) |
| 286 | EXP = add(EXP,#-DF_BIAS) // subtract bias --> signed exp |
| 287 | } |
| 288 | { |
| 289 | // If carry set, no borrow: result was still positive |
| 290 | if (P_CARRY1) ROOT = ONE |
| 291 | if (P_CARRY1) REM_LO = REM_LO_TMP |
| 292 | if (P_CARRY1) REM_HI = REM_HI_TMP |
| 293 | } |
| 294 | { |
| 295 | REM_LO_TMP = sub(REM_LO,TWOROOT_LO,P_CARRY2):carry |
| 296 | ONE = #1 |
| 297 | EXP = asr(EXP,#1) // divide signed exp by 2 |
| 298 | } |
| 299 | { |
| 300 | REM_HI_TMP = sub(REM_HI,ZERO,P_CARRY2):carry |
| 301 | ONE = add(ROOT,ONE) |
| 302 | } |
| 303 | { |
| 304 | if (P_CARRY2) ROOT = ONE |
| 305 | if (P_CARRY2) REM_LO = REM_LO_TMP |
| 306 | // since tworoot <= 2^32, remhi must be zero |
| 307 | #undef REM_HI_TMP |
| 308 | #undef REM_HI_TMP_HI |
| 309 | #define S_ONE r2 |
| 310 | #define ADJ r3 |
| 311 | S_ONE = #1 |
| 312 | } |
| 313 | { |
| 314 | P_TMP = cmp.eq(REM_LO,ZERO) // is the low part zero |
| 315 | if (!P_TMP.new) ROOTLO = or(ROOTLO,S_ONE) // if so, it's exact... hopefully |
| 316 | ADJ = cl0(ROOT) |
| 317 | EXP = add(EXP,#-63) |
| 318 | } |
| 319 | #undef REM_LO |
| 320 | #define RET r1:0 |
| 321 | #define RETHI r1 |
| 322 | { |
| 323 | RET = convert_ud2df(ROOT) // set up mantissa, maybe set inexact flag |
| 324 | EXP = add(EXP,ADJ) // add back bias |
| 325 | } |
| 326 | { |
| 327 | RETHI += asl(EXP,#DF_MANTBITS-32) // add exponent adjust |
| 328 | jumpr r31 |
| 329 | } |
| 330 | #undef REM_LO_TMP |
| 331 | #undef REM_HI_TMP |
| 332 | #undef REM_HI_TMP_HI |
| 333 | #undef REM_LO |
| 334 | #undef REM_HI |
| 335 | #undef TWOROOT_LO |
| 336 | |
| 337 | #undef RET |
| 338 | #define A r1:0 |
| 339 | #define AH r1 |
| 340 | #define AL r1 |
| 341 | #undef S_ONE |
| 342 | #define TMP r3:2 |
| 343 | #define TMPHI r3 |
| 344 | #define TMPLO r2 |
| 345 | #undef P_CARRY0 |
| 346 | #define P_NEG p1 |
| 347 | |
| 348 | |
| 349 | #define SFHALF r5 |
| 350 | #define SFRAD r9 |
| 351 | .Lsqrt_abnormal: |
| 352 | { |
| 353 | P_TMP = dfclass(A,#DFCLASS_ZERO) // zero? |
| 354 | if (P_TMP.new) jumpr:t r31 |
| 355 | } |
| 356 | { |
| 357 | P_TMP = dfclass(A,#DFCLASS_NAN) |
| 358 | if (P_TMP.new) jump:nt .Lsqrt_nan |
| 359 | } |
| 360 | { |
| 361 | P_TMP = cmp.gt(AH,#-1) |
| 362 | if (!P_TMP.new) jump:nt .Lsqrt_invalid_neg |
| 363 | if (!P_TMP.new) EXP = ##0x7F800001 // sNaN |
| 364 | } |
| 365 | { |
| 366 | P_TMP = dfclass(A,#DFCLASS_INFINITE) |
| 367 | if (P_TMP.new) jumpr:nt r31 |
| 368 | } |
| 369 | // If we got here, we're denormal |
| 370 | // prepare to restart |
| 371 | { |
| 372 | A = extractu(A,#DF_MANTBITS,#0) // Extract mantissa |
| 373 | } |
| 374 | { |
| 375 | EXP = add(clb(A),#-DF_EXPBITS) // how much to normalize? |
| 376 | } |
| 377 | { |
| 378 | A = asl(A,EXP) // Shift mantissa |
| 379 | EXP = sub(#1,EXP) // Form exponent |
| 380 | } |
| 381 | { |
| 382 | AH = insert(EXP,#1,#DF_MANTBITS-32) // insert lsb of exponent |
| 383 | } |
| 384 | { |
| 385 | TMP = extractu(A,#SF_MANTBITS+1,#DF_MANTBITS-SF_MANTBITS) // get sf value (mant+exp1) |
| 386 | SFHALF = ##0x3f000004 // form half constant |
| 387 | } |
| 388 | { |
| 389 | SFRAD = or(SFHALF,TMPLO) // form sf value |
| 390 | SFHALF = and(SFHALF,#-16) |
| 391 | jump .Ldenormal_restart // restart |
| 392 | } |
| 393 | .Lsqrt_nan: |
| 394 | { |
| 395 | EXP = convert_df2sf(A) // if sNaN, get invalid |
| 396 | A = #-1 // qNaN |
| 397 | jumpr r31 |
| 398 | } |
| 399 | .Lsqrt_invalid_neg: |
| 400 | { |
| 401 | A = convert_sf2df(EXP) // Invalid,NaNval |
| 402 | jumpr r31 |
| 403 | } |
| 404 | END(__hexagon_sqrt) |
| 405 | END(__hexagon_sqrtdf2) |