root / target-arm / nwfpe / softfloat.c @ 157777ef
History | View | Annotate | Download (114.7 kB)
1 |
/*
|
---|---|
2 |
===============================================================================
|
3 |
|
4 |
This C source file is part of the SoftFloat IEC/IEEE Floating-point
|
5 |
Arithmetic Package, Release 2.
|
6 |
|
7 |
Written by John R. Hauser. This work was made possible in part by the
|
8 |
International Computer Science Institute, located at Suite 600, 1947 Center
|
9 |
Street, Berkeley, California 94704. Funding was partially provided by the
|
10 |
National Science Foundation under grant MIP-9311980. The original version
|
11 |
of this code was written as part of a project to build a fixed-point vector
|
12 |
processor in collaboration with the University of California at Berkeley,
|
13 |
overseen by Profs. Nelson Morgan and John Wawrzynek. More information
|
14 |
is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
|
15 |
arithmetic/softfloat.html'.
|
16 |
|
17 |
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
|
18 |
has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
|
19 |
TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
|
20 |
PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
|
21 |
AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
|
22 |
|
23 |
Derivative works are acceptable, even for commercial purposes, so long as
|
24 |
(1) they include prominent notice that the work is derivative, and (2) they
|
25 |
include prominent notice akin to these three paragraphs for those parts of
|
26 |
this code that are retained.
|
27 |
|
28 |
===============================================================================
|
29 |
*/
|
30 |
|
31 |
#include "fpa11.h" |
32 |
#include "milieu.h" |
33 |
#include "softfloat.h" |
34 |
|
35 |
/*
|
36 |
-------------------------------------------------------------------------------
|
37 |
Floating-point rounding mode, extended double-precision rounding precision,
|
38 |
and exception flags.
|
39 |
-------------------------------------------------------------------------------
|
40 |
*/
|
41 |
int8 float_rounding_mode = float_round_nearest_even; |
42 |
int8 floatx80_rounding_precision = 80;
|
43 |
int8 float_exception_flags; |
44 |
|
45 |
/*
|
46 |
-------------------------------------------------------------------------------
|
47 |
Primitive arithmetic functions, including multi-word arithmetic, and
|
48 |
division and square root approximations. (Can be specialized to target if
|
49 |
desired.)
|
50 |
-------------------------------------------------------------------------------
|
51 |
*/
|
52 |
#include "softfloat-macros" |
53 |
|
54 |
/*
|
55 |
-------------------------------------------------------------------------------
|
56 |
Functions and definitions to determine: (1) whether tininess for underflow
|
57 |
is detected before or after rounding by default, (2) what (if anything)
|
58 |
happens when exceptions are raised, (3) how signaling NaNs are distinguished
|
59 |
from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
|
60 |
are propagated from function inputs to output. These details are target-
|
61 |
specific.
|
62 |
-------------------------------------------------------------------------------
|
63 |
*/
|
64 |
#include "softfloat-specialize" |
65 |
|
66 |
/*
|
67 |
-------------------------------------------------------------------------------
|
68 |
Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
|
69 |
and 7, and returns the properly rounded 32-bit integer corresponding to the
|
70 |
input. If `zSign' is nonzero, the input is negated before being converted
|
71 |
to an integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point
|
72 |
input is simply rounded to an integer, with the inexact exception raised if
|
73 |
the input cannot be represented exactly as an integer. If the fixed-point
|
74 |
input is too large, however, the invalid exception is raised and the largest
|
75 |
positive or negative integer is returned.
|
76 |
-------------------------------------------------------------------------------
|
77 |
*/
|
78 |
static int32 roundAndPackInt32( flag zSign, bits64 absZ )
|
79 |
{ |
80 |
int8 roundingMode; |
81 |
flag roundNearestEven; |
82 |
int8 roundIncrement, roundBits; |
83 |
int32 z; |
84 |
|
85 |
roundingMode = float_rounding_mode; |
86 |
roundNearestEven = ( roundingMode == float_round_nearest_even ); |
87 |
roundIncrement = 0x40;
|
88 |
if ( ! roundNearestEven ) {
|
89 |
if ( roundingMode == float_round_to_zero ) {
|
90 |
roundIncrement = 0;
|
91 |
} |
92 |
else {
|
93 |
roundIncrement = 0x7F;
|
94 |
if ( zSign ) {
|
95 |
if ( roundingMode == float_round_up ) roundIncrement = 0; |
96 |
} |
97 |
else {
|
98 |
if ( roundingMode == float_round_down ) roundIncrement = 0; |
99 |
} |
100 |
} |
101 |
} |
102 |
roundBits = absZ & 0x7F;
|
103 |
absZ = ( absZ + roundIncrement )>>7;
|
104 |
absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); |
105 |
z = absZ; |
106 |
if ( zSign ) z = - z;
|
107 |
if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { |
108 |
float_exception_flags |= float_flag_invalid; |
109 |
return zSign ? 0x80000000 : 0x7FFFFFFF; |
110 |
} |
111 |
if ( roundBits ) float_exception_flags |= float_flag_inexact;
|
112 |
return z;
|
113 |
|
114 |
} |
115 |
|
116 |
/*
|
117 |
-------------------------------------------------------------------------------
|
118 |
Returns the fraction bits of the single-precision floating-point value `a'.
|
119 |
-------------------------------------------------------------------------------
|
120 |
*/
|
121 |
INLINE bits32 extractFloat32Frac( float32 a ) |
122 |
{ |
123 |
|
124 |
return a & 0x007FFFFF; |
125 |
|
126 |
} |
127 |
|
128 |
/*
|
129 |
-------------------------------------------------------------------------------
|
130 |
Returns the exponent bits of the single-precision floating-point value `a'.
|
131 |
-------------------------------------------------------------------------------
|
132 |
*/
|
133 |
INLINE int16 extractFloat32Exp( float32 a ) |
134 |
{ |
135 |
|
136 |
return ( a>>23 ) & 0xFF; |
137 |
|
138 |
} |
139 |
|
140 |
/*
|
141 |
-------------------------------------------------------------------------------
|
142 |
Returns the sign bit of the single-precision floating-point value `a'.
|
143 |
-------------------------------------------------------------------------------
|
144 |
*/
|
145 |
INLINE flag extractFloat32Sign( float32 a ) |
146 |
{ |
147 |
|
148 |
return a>>31; |
149 |
|
150 |
} |
151 |
|
152 |
/*
|
153 |
-------------------------------------------------------------------------------
|
154 |
Normalizes the subnormal single-precision floating-point value represented
|
155 |
by the denormalized significand `aSig'. The normalized exponent and
|
156 |
significand are stored at the locations pointed to by `zExpPtr' and
|
157 |
`zSigPtr', respectively.
|
158 |
-------------------------------------------------------------------------------
|
159 |
*/
|
160 |
static void |
161 |
normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) |
162 |
{ |
163 |
int8 shiftCount; |
164 |
|
165 |
shiftCount = countLeadingZeros32( aSig ) - 8;
|
166 |
*zSigPtr = aSig<<shiftCount; |
167 |
*zExpPtr = 1 - shiftCount;
|
168 |
|
169 |
} |
170 |
|
171 |
/*
|
172 |
-------------------------------------------------------------------------------
|
173 |
Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
|
174 |
single-precision floating-point value, returning the result. After being
|
175 |
shifted into the proper positions, the three fields are simply added
|
176 |
together to form the result. This means that any integer portion of `zSig'
|
177 |
will be added into the exponent. Since a properly normalized significand
|
178 |
will have an integer portion equal to 1, the `zExp' input should be 1 less
|
179 |
than the desired result exponent whenever `zSig' is a complete, normalized
|
180 |
significand.
|
181 |
-------------------------------------------------------------------------------
|
182 |
*/
|
183 |
INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig ) |
184 |
{ |
185 |
return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig; |
186 |
} |
187 |
|
188 |
/*
|
189 |
-------------------------------------------------------------------------------
|
190 |
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
|
191 |
and significand `zSig', and returns the proper single-precision floating-
|
192 |
point value corresponding to the abstract input. Ordinarily, the abstract
|
193 |
value is simply rounded and packed into the single-precision format, with
|
194 |
the inexact exception raised if the abstract input cannot be represented
|
195 |
exactly. If the abstract value is too large, however, the overflow and
|
196 |
inexact exceptions are raised and an infinity or maximal finite value is
|
197 |
returned. If the abstract value is too small, the input value is rounded to
|
198 |
a subnormal number, and the underflow and inexact exceptions are raised if
|
199 |
the abstract input cannot be represented exactly as a subnormal single-
|
200 |
precision floating-point number.
|
201 |
The input significand `zSig' has its binary point between bits 30
|
202 |
and 29, which is 7 bits to the left of the usual location. This shifted
|
203 |
significand must be normalized or smaller. If `zSig' is not normalized,
|
204 |
`zExp' must be 0; in that case, the result returned is a subnormal number,
|
205 |
and it must not require rounding. In the usual case that `zSig' is
|
206 |
normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
|
207 |
The handling of underflow and overflow follows the IEC/IEEE Standard for
|
208 |
Binary Floating-point Arithmetic.
|
209 |
-------------------------------------------------------------------------------
|
210 |
*/
|
211 |
static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
|
212 |
{ |
213 |
int8 roundingMode; |
214 |
flag roundNearestEven; |
215 |
int8 roundIncrement, roundBits; |
216 |
flag isTiny; |
217 |
|
218 |
roundingMode = float_rounding_mode; |
219 |
roundNearestEven = ( roundingMode == float_round_nearest_even ); |
220 |
roundIncrement = 0x40;
|
221 |
if ( ! roundNearestEven ) {
|
222 |
if ( roundingMode == float_round_to_zero ) {
|
223 |
roundIncrement = 0;
|
224 |
} |
225 |
else {
|
226 |
roundIncrement = 0x7F;
|
227 |
if ( zSign ) {
|
228 |
if ( roundingMode == float_round_up ) roundIncrement = 0; |
229 |
} |
230 |
else {
|
231 |
if ( roundingMode == float_round_down ) roundIncrement = 0; |
232 |
} |
233 |
} |
234 |
} |
235 |
roundBits = zSig & 0x7F;
|
236 |
if ( 0xFD <= (bits16) zExp ) { |
237 |
if ( ( 0xFD < zExp ) |
238 |
|| ( ( zExp == 0xFD )
|
239 |
&& ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
|
240 |
) { |
241 |
float_raise( float_flag_overflow | float_flag_inexact ); |
242 |
return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 ); |
243 |
} |
244 |
if ( zExp < 0 ) { |
245 |
isTiny = |
246 |
( float_detect_tininess == float_tininess_before_rounding ) |
247 |
|| ( zExp < -1 )
|
248 |
|| ( zSig + roundIncrement < 0x80000000 );
|
249 |
shift32RightJamming( zSig, - zExp, &zSig ); |
250 |
zExp = 0;
|
251 |
roundBits = zSig & 0x7F;
|
252 |
if ( isTiny && roundBits ) float_raise( float_flag_underflow );
|
253 |
} |
254 |
} |
255 |
if ( roundBits ) float_exception_flags |= float_flag_inexact;
|
256 |
zSig = ( zSig + roundIncrement )>>7;
|
257 |
zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); |
258 |
if ( zSig == 0 ) zExp = 0; |
259 |
return packFloat32( zSign, zExp, zSig );
|
260 |
|
261 |
} |
262 |
|
263 |
/*
|
264 |
-------------------------------------------------------------------------------
|
265 |
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
|
266 |
and significand `zSig', and returns the proper single-precision floating-
|
267 |
point value corresponding to the abstract input. This routine is just like
|
268 |
`roundAndPackFloat32' except that `zSig' does not have to be normalized in
|
269 |
any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
|
270 |
point exponent.
|
271 |
-------------------------------------------------------------------------------
|
272 |
*/
|
273 |
static float32
|
274 |
normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) |
275 |
{ |
276 |
int8 shiftCount; |
277 |
|
278 |
shiftCount = countLeadingZeros32( zSig ) - 1;
|
279 |
return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
|
280 |
|
281 |
} |
282 |
|
283 |
/*
|
284 |
-------------------------------------------------------------------------------
|
285 |
Returns the fraction bits of the double-precision floating-point value `a'.
|
286 |
-------------------------------------------------------------------------------
|
287 |
*/
|
288 |
INLINE bits64 extractFloat64Frac( float64 a ) |
289 |
{ |
290 |
|
291 |
return a & LIT64( 0x000FFFFFFFFFFFFF ); |
292 |
|
293 |
} |
294 |
|
295 |
/*
|
296 |
-------------------------------------------------------------------------------
|
297 |
Returns the exponent bits of the double-precision floating-point value `a'.
|
298 |
-------------------------------------------------------------------------------
|
299 |
*/
|
300 |
INLINE int16 extractFloat64Exp( float64 a ) |
301 |
{ |
302 |
|
303 |
return ( a>>52 ) & 0x7FF; |
304 |
|
305 |
} |
306 |
|
307 |
/*
|
308 |
-------------------------------------------------------------------------------
|
309 |
Returns the sign bit of the double-precision floating-point value `a'.
|
310 |
-------------------------------------------------------------------------------
|
311 |
*/
|
312 |
INLINE flag extractFloat64Sign( float64 a ) |
313 |
{ |
314 |
|
315 |
return a>>63; |
316 |
|
317 |
} |
318 |
|
319 |
/*
|
320 |
-------------------------------------------------------------------------------
|
321 |
Normalizes the subnormal double-precision floating-point value represented
|
322 |
by the denormalized significand `aSig'. The normalized exponent and
|
323 |
significand are stored at the locations pointed to by `zExpPtr' and
|
324 |
`zSigPtr', respectively.
|
325 |
-------------------------------------------------------------------------------
|
326 |
*/
|
327 |
static void |
328 |
normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr ) |
329 |
{ |
330 |
int8 shiftCount; |
331 |
|
332 |
shiftCount = countLeadingZeros64( aSig ) - 11;
|
333 |
*zSigPtr = aSig<<shiftCount; |
334 |
*zExpPtr = 1 - shiftCount;
|
335 |
|
336 |
} |
337 |
|
338 |
/*
|
339 |
-------------------------------------------------------------------------------
|
340 |
Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
|
341 |
double-precision floating-point value, returning the result. After being
|
342 |
shifted into the proper positions, the three fields are simply added
|
343 |
together to form the result. This means that any integer portion of `zSig'
|
344 |
will be added into the exponent. Since a properly normalized significand
|
345 |
will have an integer portion equal to 1, the `zExp' input should be 1 less
|
346 |
than the desired result exponent whenever `zSig' is a complete, normalized
|
347 |
significand.
|
348 |
-------------------------------------------------------------------------------
|
349 |
*/
|
350 |
INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig ) |
351 |
{ |
352 |
|
353 |
return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig; |
354 |
|
355 |
} |
356 |
|
357 |
/*
|
358 |
-------------------------------------------------------------------------------
|
359 |
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
|
360 |
and significand `zSig', and returns the proper double-precision floating-
|
361 |
point value corresponding to the abstract input. Ordinarily, the abstract
|
362 |
value is simply rounded and packed into the double-precision format, with
|
363 |
the inexact exception raised if the abstract input cannot be represented
|
364 |
exactly. If the abstract value is too large, however, the overflow and
|
365 |
inexact exceptions are raised and an infinity or maximal finite value is
|
366 |
returned. If the abstract value is too small, the input value is rounded to
|
367 |
a subnormal number, and the underflow and inexact exceptions are raised if
|
368 |
the abstract input cannot be represented exactly as a subnormal double-
|
369 |
precision floating-point number.
|
370 |
The input significand `zSig' has its binary point between bits 62
|
371 |
and 61, which is 10 bits to the left of the usual location. This shifted
|
372 |
significand must be normalized or smaller. If `zSig' is not normalized,
|
373 |
`zExp' must be 0; in that case, the result returned is a subnormal number,
|
374 |
and it must not require rounding. In the usual case that `zSig' is
|
375 |
normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
|
376 |
The handling of underflow and overflow follows the IEC/IEEE Standard for
|
377 |
Binary Floating-point Arithmetic.
|
378 |
-------------------------------------------------------------------------------
|
379 |
*/
|
380 |
static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
|
381 |
{ |
382 |
int8 roundingMode; |
383 |
flag roundNearestEven; |
384 |
int16 roundIncrement, roundBits; |
385 |
flag isTiny; |
386 |
|
387 |
roundingMode = float_rounding_mode; |
388 |
roundNearestEven = ( roundingMode == float_round_nearest_even ); |
389 |
roundIncrement = 0x200;
|
390 |
if ( ! roundNearestEven ) {
|
391 |
if ( roundingMode == float_round_to_zero ) {
|
392 |
roundIncrement = 0;
|
393 |
} |
394 |
else {
|
395 |
roundIncrement = 0x3FF;
|
396 |
if ( zSign ) {
|
397 |
if ( roundingMode == float_round_up ) roundIncrement = 0; |
398 |
} |
399 |
else {
|
400 |
if ( roundingMode == float_round_down ) roundIncrement = 0; |
401 |
} |
402 |
} |
403 |
} |
404 |
roundBits = zSig & 0x3FF;
|
405 |
if ( 0x7FD <= (bits16) zExp ) { |
406 |
if ( ( 0x7FD < zExp ) |
407 |
|| ( ( zExp == 0x7FD )
|
408 |
&& ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
|
409 |
) { |
410 |
//register int lr = __builtin_return_address(0);
|
411 |
//printk("roundAndPackFloat64 called from 0x%08x\n",lr);
|
412 |
float_raise( float_flag_overflow | float_flag_inexact ); |
413 |
return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 ); |
414 |
} |
415 |
if ( zExp < 0 ) { |
416 |
isTiny = |
417 |
( float_detect_tininess == float_tininess_before_rounding ) |
418 |
|| ( zExp < -1 )
|
419 |
|| ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
|
420 |
shift64RightJamming( zSig, - zExp, &zSig ); |
421 |
zExp = 0;
|
422 |
roundBits = zSig & 0x3FF;
|
423 |
if ( isTiny && roundBits ) float_raise( float_flag_underflow );
|
424 |
} |
425 |
} |
426 |
if ( roundBits ) float_exception_flags |= float_flag_inexact;
|
427 |
zSig = ( zSig + roundIncrement )>>10;
|
428 |
zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); |
429 |
if ( zSig == 0 ) zExp = 0; |
430 |
return packFloat64( zSign, zExp, zSig );
|
431 |
|
432 |
} |
433 |
|
434 |
/*
|
435 |
-------------------------------------------------------------------------------
|
436 |
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
|
437 |
and significand `zSig', and returns the proper double-precision floating-
|
438 |
point value corresponding to the abstract input. This routine is just like
|
439 |
`roundAndPackFloat64' except that `zSig' does not have to be normalized in
|
440 |
any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
|
441 |
point exponent.
|
442 |
-------------------------------------------------------------------------------
|
443 |
*/
|
444 |
static float64
|
445 |
normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) |
446 |
{ |
447 |
int8 shiftCount; |
448 |
|
449 |
shiftCount = countLeadingZeros64( zSig ) - 1;
|
450 |
return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
|
451 |
|
452 |
} |
453 |
|
454 |
#ifdef FLOATX80
|
455 |
|
456 |
/*
|
457 |
-------------------------------------------------------------------------------
|
458 |
Returns the fraction bits of the extended double-precision floating-point
|
459 |
value `a'.
|
460 |
-------------------------------------------------------------------------------
|
461 |
*/
|
462 |
INLINE bits64 extractFloatx80Frac( floatx80 a ) |
463 |
{ |
464 |
|
465 |
return a.low;
|
466 |
|
467 |
} |
468 |
|
469 |
/*
|
470 |
-------------------------------------------------------------------------------
|
471 |
Returns the exponent bits of the extended double-precision floating-point
|
472 |
value `a'.
|
473 |
-------------------------------------------------------------------------------
|
474 |
*/
|
475 |
INLINE int32 extractFloatx80Exp( floatx80 a ) |
476 |
{ |
477 |
|
478 |
return a.high & 0x7FFF; |
479 |
|
480 |
} |
481 |
|
482 |
/*
|
483 |
-------------------------------------------------------------------------------
|
484 |
Returns the sign bit of the extended double-precision floating-point value
|
485 |
`a'.
|
486 |
-------------------------------------------------------------------------------
|
487 |
*/
|
488 |
INLINE flag extractFloatx80Sign( floatx80 a ) |
489 |
{ |
490 |
|
491 |
return a.high>>15; |
492 |
|
493 |
} |
494 |
|
495 |
/*
|
496 |
-------------------------------------------------------------------------------
|
497 |
Normalizes the subnormal extended double-precision floating-point value
|
498 |
represented by the denormalized significand `aSig'. The normalized exponent
|
499 |
and significand are stored at the locations pointed to by `zExpPtr' and
|
500 |
`zSigPtr', respectively.
|
501 |
-------------------------------------------------------------------------------
|
502 |
*/
|
503 |
static void |
504 |
normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr ) |
505 |
{ |
506 |
int8 shiftCount; |
507 |
|
508 |
shiftCount = countLeadingZeros64( aSig ); |
509 |
*zSigPtr = aSig<<shiftCount; |
510 |
*zExpPtr = 1 - shiftCount;
|
511 |
|
512 |
} |
513 |
|
514 |
/*
|
515 |
-------------------------------------------------------------------------------
|
516 |
Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
|
517 |
extended double-precision floating-point value, returning the result.
|
518 |
-------------------------------------------------------------------------------
|
519 |
*/
|
520 |
INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig ) |
521 |
{ |
522 |
floatx80 z; |
523 |
|
524 |
z.low = zSig; |
525 |
z.high = ( ( (bits16) zSign )<<15 ) + zExp;
|
526 |
return z;
|
527 |
|
528 |
} |
529 |
|
530 |
/*
|
531 |
-------------------------------------------------------------------------------
|
532 |
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
|
533 |
and extended significand formed by the concatenation of `zSig0' and `zSig1',
|
534 |
and returns the proper extended double-precision floating-point value
|
535 |
corresponding to the abstract input. Ordinarily, the abstract value is
|
536 |
rounded and packed into the extended double-precision format, with the
|
537 |
inexact exception raised if the abstract input cannot be represented
|
538 |
exactly. If the abstract value is too large, however, the overflow and
|
539 |
inexact exceptions are raised and an infinity or maximal finite value is
|
540 |
returned. If the abstract value is too small, the input value is rounded to
|
541 |
a subnormal number, and the underflow and inexact exceptions are raised if
|
542 |
the abstract input cannot be represented exactly as a subnormal extended
|
543 |
double-precision floating-point number.
|
544 |
If `roundingPrecision' is 32 or 64, the result is rounded to the same
|
545 |
number of bits as single or double precision, respectively. Otherwise, the
|
546 |
result is rounded to the full precision of the extended double-precision
|
547 |
format.
|
548 |
The input significand must be normalized or smaller. If the input
|
549 |
significand is not normalized, `zExp' must be 0; in that case, the result
|
550 |
returned is a subnormal number, and it must not require rounding. The
|
551 |
handling of underflow and overflow follows the IEC/IEEE Standard for Binary
|
552 |
Floating-point Arithmetic.
|
553 |
-------------------------------------------------------------------------------
|
554 |
*/
|
555 |
static floatx80
|
556 |
roundAndPackFloatx80( |
557 |
int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 |
558 |
) |
559 |
{ |
560 |
int8 roundingMode; |
561 |
flag roundNearestEven, increment, isTiny; |
562 |
int64 roundIncrement, roundMask, roundBits; |
563 |
|
564 |
roundingMode = float_rounding_mode; |
565 |
roundNearestEven = ( roundingMode == float_round_nearest_even ); |
566 |
if ( roundingPrecision == 80 ) goto precision80; |
567 |
if ( roundingPrecision == 64 ) { |
568 |
roundIncrement = LIT64( 0x0000000000000400 );
|
569 |
roundMask = LIT64( 0x00000000000007FF );
|
570 |
} |
571 |
else if ( roundingPrecision == 32 ) { |
572 |
roundIncrement = LIT64( 0x0000008000000000 );
|
573 |
roundMask = LIT64( 0x000000FFFFFFFFFF );
|
574 |
} |
575 |
else {
|
576 |
goto precision80;
|
577 |
} |
578 |
zSig0 |= ( zSig1 != 0 );
|
579 |
if ( ! roundNearestEven ) {
|
580 |
if ( roundingMode == float_round_to_zero ) {
|
581 |
roundIncrement = 0;
|
582 |
} |
583 |
else {
|
584 |
roundIncrement = roundMask; |
585 |
if ( zSign ) {
|
586 |
if ( roundingMode == float_round_up ) roundIncrement = 0; |
587 |
} |
588 |
else {
|
589 |
if ( roundingMode == float_round_down ) roundIncrement = 0; |
590 |
} |
591 |
} |
592 |
} |
593 |
roundBits = zSig0 & roundMask; |
594 |
if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { |
595 |
if ( ( 0x7FFE < zExp ) |
596 |
|| ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
|
597 |
) { |
598 |
goto overflow;
|
599 |
} |
600 |
if ( zExp <= 0 ) { |
601 |
isTiny = |
602 |
( float_detect_tininess == float_tininess_before_rounding ) |
603 |
|| ( zExp < 0 )
|
604 |
|| ( zSig0 <= zSig0 + roundIncrement ); |
605 |
shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
|
606 |
zExp = 0;
|
607 |
roundBits = zSig0 & roundMask; |
608 |
if ( isTiny && roundBits ) float_raise( float_flag_underflow );
|
609 |
if ( roundBits ) float_exception_flags |= float_flag_inexact;
|
610 |
zSig0 += roundIncrement; |
611 |
if ( (sbits64) zSig0 < 0 ) zExp = 1; |
612 |
roundIncrement = roundMask + 1;
|
613 |
if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { |
614 |
roundMask |= roundIncrement; |
615 |
} |
616 |
zSig0 &= ~ roundMask; |
617 |
return packFloatx80( zSign, zExp, zSig0 );
|
618 |
} |
619 |
} |
620 |
if ( roundBits ) float_exception_flags |= float_flag_inexact;
|
621 |
zSig0 += roundIncrement; |
622 |
if ( zSig0 < roundIncrement ) {
|
623 |
++zExp; |
624 |
zSig0 = LIT64( 0x8000000000000000 );
|
625 |
} |
626 |
roundIncrement = roundMask + 1;
|
627 |
if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { |
628 |
roundMask |= roundIncrement; |
629 |
} |
630 |
zSig0 &= ~ roundMask; |
631 |
if ( zSig0 == 0 ) zExp = 0; |
632 |
return packFloatx80( zSign, zExp, zSig0 );
|
633 |
precision80:
|
634 |
increment = ( (sbits64) zSig1 < 0 );
|
635 |
if ( ! roundNearestEven ) {
|
636 |
if ( roundingMode == float_round_to_zero ) {
|
637 |
increment = 0;
|
638 |
} |
639 |
else {
|
640 |
if ( zSign ) {
|
641 |
increment = ( roundingMode == float_round_down ) && zSig1; |
642 |
} |
643 |
else {
|
644 |
increment = ( roundingMode == float_round_up ) && zSig1; |
645 |
} |
646 |
} |
647 |
} |
648 |
if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { |
649 |
if ( ( 0x7FFE < zExp ) |
650 |
|| ( ( zExp == 0x7FFE )
|
651 |
&& ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
|
652 |
&& increment |
653 |
) |
654 |
) { |
655 |
roundMask = 0;
|
656 |
overflow:
|
657 |
float_raise( float_flag_overflow | float_flag_inexact ); |
658 |
if ( ( roundingMode == float_round_to_zero )
|
659 |
|| ( zSign && ( roundingMode == float_round_up ) ) |
660 |
|| ( ! zSign && ( roundingMode == float_round_down ) ) |
661 |
) { |
662 |
return packFloatx80( zSign, 0x7FFE, ~ roundMask ); |
663 |
} |
664 |
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); |
665 |
} |
666 |
if ( zExp <= 0 ) { |
667 |
isTiny = |
668 |
( float_detect_tininess == float_tininess_before_rounding ) |
669 |
|| ( zExp < 0 )
|
670 |
|| ! increment |
671 |
|| ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
|
672 |
shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
|
673 |
zExp = 0;
|
674 |
if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
|
675 |
if ( zSig1 ) float_exception_flags |= float_flag_inexact;
|
676 |
if ( roundNearestEven ) {
|
677 |
increment = ( (sbits64) zSig1 < 0 );
|
678 |
} |
679 |
else {
|
680 |
if ( zSign ) {
|
681 |
increment = ( roundingMode == float_round_down ) && zSig1; |
682 |
} |
683 |
else {
|
684 |
increment = ( roundingMode == float_round_up ) && zSig1; |
685 |
} |
686 |
} |
687 |
if ( increment ) {
|
688 |
++zSig0; |
689 |
zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
|
690 |
if ( (sbits64) zSig0 < 0 ) zExp = 1; |
691 |
} |
692 |
return packFloatx80( zSign, zExp, zSig0 );
|
693 |
} |
694 |
} |
695 |
if ( zSig1 ) float_exception_flags |= float_flag_inexact;
|
696 |
if ( increment ) {
|
697 |
++zSig0; |
698 |
if ( zSig0 == 0 ) { |
699 |
++zExp; |
700 |
zSig0 = LIT64( 0x8000000000000000 );
|
701 |
} |
702 |
else {
|
703 |
zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
|
704 |
} |
705 |
} |
706 |
else {
|
707 |
if ( zSig0 == 0 ) zExp = 0; |
708 |
} |
709 |
|
710 |
return packFloatx80( zSign, zExp, zSig0 );
|
711 |
} |
712 |
|
713 |
/*
|
714 |
-------------------------------------------------------------------------------
|
715 |
Takes an abstract floating-point value having sign `zSign', exponent
|
716 |
`zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
|
717 |
and returns the proper extended double-precision floating-point value
|
718 |
corresponding to the abstract input. This routine is just like
|
719 |
`roundAndPackFloatx80' except that the input significand does not have to be
|
720 |
normalized.
|
721 |
-------------------------------------------------------------------------------
|
722 |
*/
|
723 |
static floatx80
|
724 |
normalizeRoundAndPackFloatx80( |
725 |
int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 |
726 |
) |
727 |
{ |
728 |
int8 shiftCount; |
729 |
|
730 |
if ( zSig0 == 0 ) { |
731 |
zSig0 = zSig1; |
732 |
zSig1 = 0;
|
733 |
zExp -= 64;
|
734 |
} |
735 |
shiftCount = countLeadingZeros64( zSig0 ); |
736 |
shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); |
737 |
zExp -= shiftCount; |
738 |
return
|
739 |
roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 ); |
740 |
|
741 |
} |
742 |
|
743 |
#endif
|
744 |
|
745 |
/*
|
746 |
-------------------------------------------------------------------------------
|
747 |
Returns the result of converting the 32-bit two's complement integer `a' to
|
748 |
the single-precision floating-point format. The conversion is performed
|
749 |
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
|
750 |
-------------------------------------------------------------------------------
|
751 |
*/
|
752 |
float32 int32_to_float32( int32 a ) |
753 |
{ |
754 |
flag zSign; |
755 |
|
756 |
if ( a == 0 ) return 0; |
757 |
if ( a == 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); |
758 |
zSign = ( a < 0 );
|
759 |
return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a ); |
760 |
|
761 |
} |
762 |
|
763 |
/*
|
764 |
-------------------------------------------------------------------------------
|
765 |
Returns the result of converting the 32-bit two's complement integer `a' to
|
766 |
the double-precision floating-point format. The conversion is performed
|
767 |
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
|
768 |
-------------------------------------------------------------------------------
|
769 |
*/
|
770 |
float64 int32_to_float64( int32 a ) |
771 |
{ |
772 |
flag aSign; |
773 |
uint32 absA; |
774 |
int8 shiftCount; |
775 |
bits64 zSig; |
776 |
|
777 |
if ( a == 0 ) return 0; |
778 |
aSign = ( a < 0 );
|
779 |
absA = aSign ? - a : a; |
780 |
shiftCount = countLeadingZeros32( absA ) + 21;
|
781 |
zSig = absA; |
782 |
return packFloat64( aSign, 0x432 - shiftCount, zSig<<shiftCount ); |
783 |
|
784 |
} |
785 |
|
786 |
#ifdef FLOATX80
|
787 |
|
788 |
/*
|
789 |
-------------------------------------------------------------------------------
|
790 |
Returns the result of converting the 32-bit two's complement integer `a'
|
791 |
to the extended double-precision floating-point format. The conversion
|
792 |
is performed according to the IEC/IEEE Standard for Binary Floating-point
|
793 |
Arithmetic.
|
794 |
-------------------------------------------------------------------------------
|
795 |
*/
|
796 |
floatx80 int32_to_floatx80( int32 a ) |
797 |
{ |
798 |
flag zSign; |
799 |
uint32 absA; |
800 |
int8 shiftCount; |
801 |
bits64 zSig; |
802 |
|
803 |
if ( a == 0 ) return packFloatx80( 0, 0, 0 ); |
804 |
zSign = ( a < 0 );
|
805 |
absA = zSign ? - a : a; |
806 |
shiftCount = countLeadingZeros32( absA ) + 32;
|
807 |
zSig = absA; |
808 |
return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); |
809 |
|
810 |
} |
811 |
|
812 |
#endif
|
813 |
|
814 |
/*
|
815 |
-------------------------------------------------------------------------------
|
816 |
Returns the result of converting the single-precision floating-point value
|
817 |
`a' to the 32-bit two's complement integer format. The conversion is
|
818 |
performed according to the IEC/IEEE Standard for Binary Floating-point
|
819 |
Arithmetic---which means in particular that the conversion is rounded
|
820 |
according to the current rounding mode. If `a' is a NaN, the largest
|
821 |
positive integer is returned. Otherwise, if the conversion overflows, the
|
822 |
largest integer with the same sign as `a' is returned.
|
823 |
-------------------------------------------------------------------------------
|
824 |
*/
|
825 |
int32 float32_to_int32( float32 a ) |
826 |
{ |
827 |
flag aSign; |
828 |
int16 aExp, shiftCount; |
829 |
bits32 aSig; |
830 |
bits64 zSig; |
831 |
|
832 |
aSig = extractFloat32Frac( a ); |
833 |
aExp = extractFloat32Exp( a ); |
834 |
aSign = extractFloat32Sign( a ); |
835 |
if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; |
836 |
if ( aExp ) aSig |= 0x00800000; |
837 |
shiftCount = 0xAF - aExp;
|
838 |
zSig = aSig; |
839 |
zSig <<= 32;
|
840 |
if ( 0 < shiftCount ) shift64RightJamming( zSig, shiftCount, &zSig ); |
841 |
return roundAndPackInt32( aSign, zSig );
|
842 |
|
843 |
} |
844 |
|
845 |
/*
|
846 |
-------------------------------------------------------------------------------
|
847 |
Returns the result of converting the single-precision floating-point value
|
848 |
`a' to the 32-bit two's complement integer format. The conversion is
|
849 |
performed according to the IEC/IEEE Standard for Binary Floating-point
|
850 |
Arithmetic, except that the conversion is always rounded toward zero. If
|
851 |
`a' is a NaN, the largest positive integer is returned. Otherwise, if the
|
852 |
conversion overflows, the largest integer with the same sign as `a' is
|
853 |
returned.
|
854 |
-------------------------------------------------------------------------------
|
855 |
*/
|
856 |
int32 float32_to_int32_round_to_zero( float32 a ) |
857 |
{ |
858 |
flag aSign; |
859 |
int16 aExp, shiftCount; |
860 |
bits32 aSig; |
861 |
int32 z; |
862 |
|
863 |
aSig = extractFloat32Frac( a ); |
864 |
aExp = extractFloat32Exp( a ); |
865 |
aSign = extractFloat32Sign( a ); |
866 |
shiftCount = aExp - 0x9E;
|
867 |
if ( 0 <= shiftCount ) { |
868 |
if ( a == 0xCF000000 ) return 0x80000000; |
869 |
float_raise( float_flag_invalid ); |
870 |
if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; |
871 |
return 0x80000000; |
872 |
} |
873 |
else if ( aExp <= 0x7E ) { |
874 |
if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
|
875 |
return 0; |
876 |
} |
877 |
aSig = ( aSig | 0x00800000 )<<8; |
878 |
z = aSig>>( - shiftCount ); |
879 |
if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { |
880 |
float_exception_flags |= float_flag_inexact; |
881 |
} |
882 |
return aSign ? - z : z;
|
883 |
|
884 |
} |
885 |
|
886 |
/*
|
887 |
-------------------------------------------------------------------------------
|
888 |
Returns the result of converting the single-precision floating-point value
|
889 |
`a' to the double-precision floating-point format. The conversion is
|
890 |
performed according to the IEC/IEEE Standard for Binary Floating-point
|
891 |
Arithmetic.
|
892 |
-------------------------------------------------------------------------------
|
893 |
*/
|
894 |
float64 float32_to_float64( float32 a ) |
895 |
{ |
896 |
flag aSign; |
897 |
int16 aExp; |
898 |
bits32 aSig; |
899 |
|
900 |
aSig = extractFloat32Frac( a ); |
901 |
aExp = extractFloat32Exp( a ); |
902 |
aSign = extractFloat32Sign( a ); |
903 |
if ( aExp == 0xFF ) { |
904 |
if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) ); |
905 |
return packFloat64( aSign, 0x7FF, 0 ); |
906 |
} |
907 |
if ( aExp == 0 ) { |
908 |
if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); |
909 |
normalizeFloat32Subnormal( aSig, &aExp, &aSig ); |
910 |
--aExp; |
911 |
} |
912 |
return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 ); |
913 |
|
914 |
} |
915 |
|
916 |
#ifdef FLOATX80
|
917 |
|
918 |
/*
|
919 |
-------------------------------------------------------------------------------
|
920 |
Returns the result of converting the single-precision floating-point value
|
921 |
`a' to the extended double-precision floating-point format. The conversion
|
922 |
is performed according to the IEC/IEEE Standard for Binary Floating-point
|
923 |
Arithmetic.
|
924 |
-------------------------------------------------------------------------------
|
925 |
*/
|
926 |
floatx80 float32_to_floatx80( float32 a ) |
927 |
{ |
928 |
flag aSign; |
929 |
int16 aExp; |
930 |
bits32 aSig; |
931 |
|
932 |
aSig = extractFloat32Frac( a ); |
933 |
aExp = extractFloat32Exp( a ); |
934 |
aSign = extractFloat32Sign( a ); |
935 |
if ( aExp == 0xFF ) { |
936 |
if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) ); |
937 |
return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); |
938 |
} |
939 |
if ( aExp == 0 ) { |
940 |
if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); |
941 |
normalizeFloat32Subnormal( aSig, &aExp, &aSig ); |
942 |
} |
943 |
aSig |= 0x00800000;
|
944 |
return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 ); |
945 |
|
946 |
} |
947 |
|
948 |
#endif
|
949 |
|
950 |
/*
|
951 |
-------------------------------------------------------------------------------
|
952 |
Rounds the single-precision floating-point value `a' to an integer, and
|
953 |
returns the result as a single-precision floating-point value. The
|
954 |
operation is performed according to the IEC/IEEE Standard for Binary
|
955 |
Floating-point Arithmetic.
|
956 |
-------------------------------------------------------------------------------
|
957 |
*/
|
958 |
float32 float32_round_to_int( float32 a ) |
959 |
{ |
960 |
flag aSign; |
961 |
int16 aExp; |
962 |
bits32 lastBitMask, roundBitsMask; |
963 |
int8 roundingMode; |
964 |
float32 z; |
965 |
|
966 |
aExp = extractFloat32Exp( a ); |
967 |
if ( 0x96 <= aExp ) { |
968 |
if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { |
969 |
return propagateFloat32NaN( a, a );
|
970 |
} |
971 |
return a;
|
972 |
} |
973 |
if ( aExp <= 0x7E ) { |
974 |
if ( (bits32) ( a<<1 ) == 0 ) return a; |
975 |
float_exception_flags |= float_flag_inexact; |
976 |
aSign = extractFloat32Sign( a ); |
977 |
switch ( float_rounding_mode ) {
|
978 |
case float_round_nearest_even:
|
979 |
if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { |
980 |
return packFloat32( aSign, 0x7F, 0 ); |
981 |
} |
982 |
break;
|
983 |
case float_round_down:
|
984 |
return aSign ? 0xBF800000 : 0; |
985 |
case float_round_up:
|
986 |
return aSign ? 0x80000000 : 0x3F800000; |
987 |
} |
988 |
return packFloat32( aSign, 0, 0 ); |
989 |
} |
990 |
lastBitMask = 1;
|
991 |
lastBitMask <<= 0x96 - aExp;
|
992 |
roundBitsMask = lastBitMask - 1;
|
993 |
z = a; |
994 |
roundingMode = float_rounding_mode; |
995 |
if ( roundingMode == float_round_nearest_even ) {
|
996 |
z += lastBitMask>>1;
|
997 |
if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; |
998 |
} |
999 |
else if ( roundingMode != float_round_to_zero ) { |
1000 |
if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
|
1001 |
z += roundBitsMask; |
1002 |
} |
1003 |
} |
1004 |
z &= ~ roundBitsMask; |
1005 |
if ( z != a ) float_exception_flags |= float_flag_inexact;
|
1006 |
return z;
|
1007 |
|
1008 |
} |
1009 |
|
1010 |
/*
|
1011 |
-------------------------------------------------------------------------------
|
1012 |
Returns the result of adding the absolute values of the single-precision
|
1013 |
floating-point values `a' and `b'. If `zSign' is true, the sum is negated
|
1014 |
before being returned. `zSign' is ignored if the result is a NaN. The
|
1015 |
addition is performed according to the IEC/IEEE Standard for Binary
|
1016 |
Floating-point Arithmetic.
|
1017 |
-------------------------------------------------------------------------------
|
1018 |
*/
|
1019 |
static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
|
1020 |
{ |
1021 |
int16 aExp, bExp, zExp; |
1022 |
bits32 aSig, bSig, zSig; |
1023 |
int16 expDiff; |
1024 |
|
1025 |
aSig = extractFloat32Frac( a ); |
1026 |
aExp = extractFloat32Exp( a ); |
1027 |
bSig = extractFloat32Frac( b ); |
1028 |
bExp = extractFloat32Exp( b ); |
1029 |
expDiff = aExp - bExp; |
1030 |
aSig <<= 6;
|
1031 |
bSig <<= 6;
|
1032 |
if ( 0 < expDiff ) { |
1033 |
if ( aExp == 0xFF ) { |
1034 |
if ( aSig ) return propagateFloat32NaN( a, b ); |
1035 |
return a;
|
1036 |
} |
1037 |
if ( bExp == 0 ) { |
1038 |
--expDiff; |
1039 |
} |
1040 |
else {
|
1041 |
bSig |= 0x20000000;
|
1042 |
} |
1043 |
shift32RightJamming( bSig, expDiff, &bSig ); |
1044 |
zExp = aExp; |
1045 |
} |
1046 |
else if ( expDiff < 0 ) { |
1047 |
if ( bExp == 0xFF ) { |
1048 |
if ( bSig ) return propagateFloat32NaN( a, b ); |
1049 |
return packFloat32( zSign, 0xFF, 0 ); |
1050 |
} |
1051 |
if ( aExp == 0 ) { |
1052 |
++expDiff; |
1053 |
} |
1054 |
else {
|
1055 |
aSig |= 0x20000000;
|
1056 |
} |
1057 |
shift32RightJamming( aSig, - expDiff, &aSig ); |
1058 |
zExp = bExp; |
1059 |
} |
1060 |
else {
|
1061 |
if ( aExp == 0xFF ) { |
1062 |
if ( aSig | bSig ) return propagateFloat32NaN( a, b ); |
1063 |
return a;
|
1064 |
} |
1065 |
if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); |
1066 |
zSig = 0x40000000 + aSig + bSig;
|
1067 |
zExp = aExp; |
1068 |
goto roundAndPack;
|
1069 |
} |
1070 |
aSig |= 0x20000000;
|
1071 |
zSig = ( aSig + bSig )<<1;
|
1072 |
--zExp; |
1073 |
if ( (sbits32) zSig < 0 ) { |
1074 |
zSig = aSig + bSig; |
1075 |
++zExp; |
1076 |
} |
1077 |
roundAndPack:
|
1078 |
return roundAndPackFloat32( zSign, zExp, zSig );
|
1079 |
|
1080 |
} |
1081 |
|
1082 |
/*
|
1083 |
-------------------------------------------------------------------------------
|
1084 |
Returns the result of subtracting the absolute values of the single-
|
1085 |
precision floating-point values `a' and `b'. If `zSign' is true, the
|
1086 |
difference is negated before being returned. `zSign' is ignored if the
|
1087 |
result is a NaN. The subtraction is performed according to the IEC/IEEE
|
1088 |
Standard for Binary Floating-point Arithmetic.
|
1089 |
-------------------------------------------------------------------------------
|
1090 |
*/
|
1091 |
static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
|
1092 |
{ |
1093 |
int16 aExp, bExp, zExp; |
1094 |
bits32 aSig, bSig, zSig; |
1095 |
int16 expDiff; |
1096 |
|
1097 |
aSig = extractFloat32Frac( a ); |
1098 |
aExp = extractFloat32Exp( a ); |
1099 |
bSig = extractFloat32Frac( b ); |
1100 |
bExp = extractFloat32Exp( b ); |
1101 |
expDiff = aExp - bExp; |
1102 |
aSig <<= 7;
|
1103 |
bSig <<= 7;
|
1104 |
if ( 0 < expDiff ) goto aExpBigger; |
1105 |
if ( expDiff < 0 ) goto bExpBigger; |
1106 |
if ( aExp == 0xFF ) { |
1107 |
if ( aSig | bSig ) return propagateFloat32NaN( a, b ); |
1108 |
float_raise( float_flag_invalid ); |
1109 |
return float32_default_nan;
|
1110 |
} |
1111 |
if ( aExp == 0 ) { |
1112 |
aExp = 1;
|
1113 |
bExp = 1;
|
1114 |
} |
1115 |
if ( bSig < aSig ) goto aBigger; |
1116 |
if ( aSig < bSig ) goto bBigger; |
1117 |
return packFloat32( float_rounding_mode == float_round_down, 0, 0 ); |
1118 |
bExpBigger:
|
1119 |
if ( bExp == 0xFF ) { |
1120 |
if ( bSig ) return propagateFloat32NaN( a, b ); |
1121 |
return packFloat32( zSign ^ 1, 0xFF, 0 ); |
1122 |
} |
1123 |
if ( aExp == 0 ) { |
1124 |
++expDiff; |
1125 |
} |
1126 |
else {
|
1127 |
aSig |= 0x40000000;
|
1128 |
} |
1129 |
shift32RightJamming( aSig, - expDiff, &aSig ); |
1130 |
bSig |= 0x40000000;
|
1131 |
bBigger:
|
1132 |
zSig = bSig - aSig; |
1133 |
zExp = bExp; |
1134 |
zSign ^= 1;
|
1135 |
goto normalizeRoundAndPack;
|
1136 |
aExpBigger:
|
1137 |
if ( aExp == 0xFF ) { |
1138 |
if ( aSig ) return propagateFloat32NaN( a, b ); |
1139 |
return a;
|
1140 |
} |
1141 |
if ( bExp == 0 ) { |
1142 |
--expDiff; |
1143 |
} |
1144 |
else {
|
1145 |
bSig |= 0x40000000;
|
1146 |
} |
1147 |
shift32RightJamming( bSig, expDiff, &bSig ); |
1148 |
aSig |= 0x40000000;
|
1149 |
aBigger:
|
1150 |
zSig = aSig - bSig; |
1151 |
zExp = aExp; |
1152 |
normalizeRoundAndPack:
|
1153 |
--zExp; |
1154 |
return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
|
1155 |
|
1156 |
} |
1157 |
|
1158 |
/*
|
1159 |
-------------------------------------------------------------------------------
|
1160 |
Returns the result of adding the single-precision floating-point values `a'
|
1161 |
and `b'. The operation is performed according to the IEC/IEEE Standard for
|
1162 |
Binary Floating-point Arithmetic.
|
1163 |
-------------------------------------------------------------------------------
|
1164 |
*/
|
1165 |
float32 float32_add( float32 a, float32 b ) |
1166 |
{ |
1167 |
flag aSign, bSign; |
1168 |
|
1169 |
aSign = extractFloat32Sign( a ); |
1170 |
bSign = extractFloat32Sign( b ); |
1171 |
if ( aSign == bSign ) {
|
1172 |
return addFloat32Sigs( a, b, aSign );
|
1173 |
} |
1174 |
else {
|
1175 |
return subFloat32Sigs( a, b, aSign );
|
1176 |
} |
1177 |
|
1178 |
} |
1179 |
|
1180 |
/*
|
1181 |
-------------------------------------------------------------------------------
|
1182 |
Returns the result of subtracting the single-precision floating-point values
|
1183 |
`a' and `b'. The operation is performed according to the IEC/IEEE Standard
|
1184 |
for Binary Floating-point Arithmetic.
|
1185 |
-------------------------------------------------------------------------------
|
1186 |
*/
|
1187 |
float32 float32_sub( float32 a, float32 b ) |
1188 |
{ |
1189 |
flag aSign, bSign; |
1190 |
|
1191 |
aSign = extractFloat32Sign( a ); |
1192 |
bSign = extractFloat32Sign( b ); |
1193 |
if ( aSign == bSign ) {
|
1194 |
return subFloat32Sigs( a, b, aSign );
|
1195 |
} |
1196 |
else {
|
1197 |
return addFloat32Sigs( a, b, aSign );
|
1198 |
} |
1199 |
|
1200 |
} |
1201 |
|
1202 |
/*
|
1203 |
-------------------------------------------------------------------------------
|
1204 |
Returns the result of multiplying the single-precision floating-point values
|
1205 |
`a' and `b'. The operation is performed according to the IEC/IEEE Standard
|
1206 |
for Binary Floating-point Arithmetic.
|
1207 |
-------------------------------------------------------------------------------
|
1208 |
*/
|
1209 |
float32 float32_mul( float32 a, float32 b ) |
1210 |
{ |
1211 |
flag aSign, bSign, zSign; |
1212 |
int16 aExp, bExp, zExp; |
1213 |
bits32 aSig, bSig; |
1214 |
bits64 zSig64; |
1215 |
bits32 zSig; |
1216 |
|
1217 |
aSig = extractFloat32Frac( a ); |
1218 |
aExp = extractFloat32Exp( a ); |
1219 |
aSign = extractFloat32Sign( a ); |
1220 |
bSig = extractFloat32Frac( b ); |
1221 |
bExp = extractFloat32Exp( b ); |
1222 |
bSign = extractFloat32Sign( b ); |
1223 |
zSign = aSign ^ bSign; |
1224 |
if ( aExp == 0xFF ) { |
1225 |
if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { |
1226 |
return propagateFloat32NaN( a, b );
|
1227 |
} |
1228 |
if ( ( bExp | bSig ) == 0 ) { |
1229 |
float_raise( float_flag_invalid ); |
1230 |
return float32_default_nan;
|
1231 |
} |
1232 |
return packFloat32( zSign, 0xFF, 0 ); |
1233 |
} |
1234 |
if ( bExp == 0xFF ) { |
1235 |
if ( bSig ) return propagateFloat32NaN( a, b ); |
1236 |
if ( ( aExp | aSig ) == 0 ) { |
1237 |
float_raise( float_flag_invalid ); |
1238 |
return float32_default_nan;
|
1239 |
} |
1240 |
return packFloat32( zSign, 0xFF, 0 ); |
1241 |
} |
1242 |
if ( aExp == 0 ) { |
1243 |
if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); |
1244 |
normalizeFloat32Subnormal( aSig, &aExp, &aSig ); |
1245 |
} |
1246 |
if ( bExp == 0 ) { |
1247 |
if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); |
1248 |
normalizeFloat32Subnormal( bSig, &bExp, &bSig ); |
1249 |
} |
1250 |
zExp = aExp + bExp - 0x7F;
|
1251 |
aSig = ( aSig | 0x00800000 )<<7; |
1252 |
bSig = ( bSig | 0x00800000 )<<8; |
1253 |
shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
|
1254 |
zSig = zSig64; |
1255 |
if ( 0 <= (sbits32) ( zSig<<1 ) ) { |
1256 |
zSig <<= 1;
|
1257 |
--zExp; |
1258 |
} |
1259 |
return roundAndPackFloat32( zSign, zExp, zSig );
|
1260 |
|
1261 |
} |
1262 |
|
1263 |
/*
|
1264 |
-------------------------------------------------------------------------------
|
1265 |
Returns the result of dividing the single-precision floating-point value `a'
|
1266 |
by the corresponding value `b'. The operation is performed according to the
|
1267 |
IEC/IEEE Standard for Binary Floating-point Arithmetic.
|
1268 |
-------------------------------------------------------------------------------
|
1269 |
*/
|
1270 |
float32 float32_div( float32 a, float32 b ) |
1271 |
{ |
1272 |
flag aSign, bSign, zSign; |
1273 |
int16 aExp, bExp, zExp; |
1274 |
bits32 aSig, bSig, zSig; |
1275 |
|
1276 |
aSig = extractFloat32Frac( a ); |
1277 |
aExp = extractFloat32Exp( a ); |
1278 |
aSign = extractFloat32Sign( a ); |
1279 |
bSig = extractFloat32Frac( b ); |
1280 |
bExp = extractFloat32Exp( b ); |
1281 |
bSign = extractFloat32Sign( b ); |
1282 |
zSign = aSign ^ bSign; |
1283 |
if ( aExp == 0xFF ) { |
1284 |
if ( aSig ) return propagateFloat32NaN( a, b ); |
1285 |
if ( bExp == 0xFF ) { |
1286 |
if ( bSig ) return propagateFloat32NaN( a, b ); |
1287 |
float_raise( float_flag_invalid ); |
1288 |
return float32_default_nan;
|
1289 |
} |
1290 |
return packFloat32( zSign, 0xFF, 0 ); |
1291 |
} |
1292 |
if ( bExp == 0xFF ) { |
1293 |
if ( bSig ) return propagateFloat32NaN( a, b ); |
1294 |
return packFloat32( zSign, 0, 0 ); |
1295 |
} |
1296 |
if ( bExp == 0 ) { |
1297 |
if ( bSig == 0 ) { |
1298 |
if ( ( aExp | aSig ) == 0 ) { |
1299 |
float_raise( float_flag_invalid ); |
1300 |
return float32_default_nan;
|
1301 |
} |
1302 |
float_raise( float_flag_divbyzero ); |
1303 |
return packFloat32( zSign, 0xFF, 0 ); |
1304 |
} |
1305 |
normalizeFloat32Subnormal( bSig, &bExp, &bSig ); |
1306 |
} |
1307 |
if ( aExp == 0 ) { |
1308 |
if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); |
1309 |
normalizeFloat32Subnormal( aSig, &aExp, &aSig ); |
1310 |
} |
1311 |
zExp = aExp - bExp + 0x7D;
|
1312 |
aSig = ( aSig | 0x00800000 )<<7; |
1313 |
bSig = ( bSig | 0x00800000 )<<8; |
1314 |
if ( bSig <= ( aSig + aSig ) ) {
|
1315 |
aSig >>= 1;
|
1316 |
++zExp; |
1317 |
} |
1318 |
zSig = ( ( (bits64) aSig )<<32 ) / bSig;
|
1319 |
if ( ( zSig & 0x3F ) == 0 ) { |
1320 |
zSig |= ( ( (bits64) bSig ) * zSig != ( (bits64) aSig )<<32 );
|
1321 |
} |
1322 |
return roundAndPackFloat32( zSign, zExp, zSig );
|
1323 |
|
1324 |
} |
1325 |
|
1326 |
/*
|
1327 |
-------------------------------------------------------------------------------
|
1328 |
Returns the remainder of the single-precision floating-point value `a'
|
1329 |
with respect to the corresponding value `b'. The operation is performed
|
1330 |
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
|
1331 |
-------------------------------------------------------------------------------
|
1332 |
*/
|
1333 |
float32 float32_rem( float32 a, float32 b ) |
1334 |
{ |
1335 |
flag aSign, bSign, zSign; |
1336 |
int16 aExp, bExp, expDiff; |
1337 |
bits32 aSig, bSig; |
1338 |
bits32 q; |
1339 |
bits64 aSig64, bSig64, q64; |
1340 |
bits32 alternateASig; |
1341 |
sbits32 sigMean; |
1342 |
|
1343 |
aSig = extractFloat32Frac( a ); |
1344 |
aExp = extractFloat32Exp( a ); |
1345 |
aSign = extractFloat32Sign( a ); |
1346 |
bSig = extractFloat32Frac( b ); |
1347 |
bExp = extractFloat32Exp( b ); |
1348 |
bSign = extractFloat32Sign( b ); |
1349 |
if ( aExp == 0xFF ) { |
1350 |
if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { |
1351 |
return propagateFloat32NaN( a, b );
|
1352 |
} |
1353 |
float_raise( float_flag_invalid ); |
1354 |
return float32_default_nan;
|
1355 |
} |
1356 |
if ( bExp == 0xFF ) { |
1357 |
if ( bSig ) return propagateFloat32NaN( a, b ); |
1358 |
return a;
|
1359 |
} |
1360 |
if ( bExp == 0 ) { |
1361 |
if ( bSig == 0 ) { |
1362 |
float_raise( float_flag_invalid ); |
1363 |
return float32_default_nan;
|
1364 |
} |
1365 |
normalizeFloat32Subnormal( bSig, &bExp, &bSig ); |
1366 |
} |
1367 |
if ( aExp == 0 ) { |
1368 |
if ( aSig == 0 ) return a; |
1369 |
normalizeFloat32Subnormal( aSig, &aExp, &aSig ); |
1370 |
} |
1371 |
expDiff = aExp - bExp; |
1372 |
aSig |= 0x00800000;
|
1373 |
bSig |= 0x00800000;
|
1374 |
if ( expDiff < 32 ) { |
1375 |
aSig <<= 8;
|
1376 |
bSig <<= 8;
|
1377 |
if ( expDiff < 0 ) { |
1378 |
if ( expDiff < -1 ) return a; |
1379 |
aSig >>= 1;
|
1380 |
} |
1381 |
q = ( bSig <= aSig ); |
1382 |
if ( q ) aSig -= bSig;
|
1383 |
if ( 0 < expDiff ) { |
1384 |
q = ( ( (bits64) aSig )<<32 ) / bSig;
|
1385 |
q >>= 32 - expDiff;
|
1386 |
bSig >>= 2;
|
1387 |
aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; |
1388 |
} |
1389 |
else {
|
1390 |
aSig >>= 2;
|
1391 |
bSig >>= 2;
|
1392 |
} |
1393 |
} |
1394 |
else {
|
1395 |
if ( bSig <= aSig ) aSig -= bSig;
|
1396 |
aSig64 = ( (bits64) aSig )<<40;
|
1397 |
bSig64 = ( (bits64) bSig )<<40;
|
1398 |
expDiff -= 64;
|
1399 |
while ( 0 < expDiff ) { |
1400 |
q64 = estimateDiv128To64( aSig64, 0, bSig64 );
|
1401 |
q64 = ( 2 < q64 ) ? q64 - 2 : 0; |
1402 |
aSig64 = - ( ( bSig * q64 )<<38 );
|
1403 |
expDiff -= 62;
|
1404 |
} |
1405 |
expDiff += 64;
|
1406 |
q64 = estimateDiv128To64( aSig64, 0, bSig64 );
|
1407 |
q64 = ( 2 < q64 ) ? q64 - 2 : 0; |
1408 |
q = q64>>( 64 - expDiff );
|
1409 |
bSig <<= 6;
|
1410 |
aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; |
1411 |
} |
1412 |
do {
|
1413 |
alternateASig = aSig; |
1414 |
++q; |
1415 |
aSig -= bSig; |
1416 |
} while ( 0 <= (sbits32) aSig ); |
1417 |
sigMean = aSig + alternateASig; |
1418 |
if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { |
1419 |
aSig = alternateASig; |
1420 |
} |
1421 |
zSign = ( (sbits32) aSig < 0 );
|
1422 |
if ( zSign ) aSig = - aSig;
|
1423 |
return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
|
1424 |
|
1425 |
} |
1426 |
|
1427 |
/*
|
1428 |
-------------------------------------------------------------------------------
|
1429 |
Returns the square root of the single-precision floating-point value `a'.
|
1430 |
The operation is performed according to the IEC/IEEE Standard for Binary
|
1431 |
Floating-point Arithmetic.
|
1432 |
-------------------------------------------------------------------------------
|
1433 |
*/
|
1434 |
float32 float32_sqrt( float32 a ) |
1435 |
{ |
1436 |
flag aSign; |
1437 |
int16 aExp, zExp; |
1438 |
bits32 aSig, zSig; |
1439 |
bits64 rem, term; |
1440 |
|
1441 |
aSig = extractFloat32Frac( a ); |
1442 |
aExp = extractFloat32Exp( a ); |
1443 |
aSign = extractFloat32Sign( a ); |
1444 |
if ( aExp == 0xFF ) { |
1445 |
if ( aSig ) return propagateFloat32NaN( a, 0 ); |
1446 |
if ( ! aSign ) return a; |
1447 |
float_raise( float_flag_invalid ); |
1448 |
return float32_default_nan;
|
1449 |
} |
1450 |
if ( aSign ) {
|
1451 |
if ( ( aExp | aSig ) == 0 ) return a; |
1452 |
float_raise( float_flag_invalid ); |
1453 |
return float32_default_nan;
|
1454 |
} |
1455 |
if ( aExp == 0 ) { |
1456 |
if ( aSig == 0 ) return 0; |
1457 |
normalizeFloat32Subnormal( aSig, &aExp, &aSig ); |
1458 |
} |
1459 |
zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; |
1460 |
aSig = ( aSig | 0x00800000 )<<8; |
1461 |
zSig = estimateSqrt32( aExp, aSig ) + 2;
|
1462 |
if ( ( zSig & 0x7F ) <= 5 ) { |
1463 |
if ( zSig < 2 ) { |
1464 |
zSig = 0xFFFFFFFF;
|
1465 |
} |
1466 |
else {
|
1467 |
aSig >>= aExp & 1;
|
1468 |
term = ( (bits64) zSig ) * zSig; |
1469 |
rem = ( ( (bits64) aSig )<<32 ) - term;
|
1470 |
while ( (sbits64) rem < 0 ) { |
1471 |
--zSig; |
1472 |
rem += ( ( (bits64) zSig )<<1 ) | 1; |
1473 |
} |
1474 |
zSig |= ( rem != 0 );
|
1475 |
} |
1476 |
} |
1477 |
shift32RightJamming( zSig, 1, &zSig );
|
1478 |
return roundAndPackFloat32( 0, zExp, zSig ); |
1479 |
|
1480 |
} |
1481 |
|
1482 |
/*
|
1483 |
-------------------------------------------------------------------------------
|
1484 |
Returns 1 if the single-precision floating-point value `a' is equal to the
|
1485 |
corresponding value `b', and 0 otherwise. The comparison is performed
|
1486 |
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
|
1487 |
-------------------------------------------------------------------------------
|
1488 |
*/
|
1489 |
flag float32_eq( float32 a, float32 b ) |
1490 |
{ |
1491 |
|
1492 |
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) |
1493 |
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
|
1494 |
) { |
1495 |
if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
|
1496 |
float_raise( float_flag_invalid ); |
1497 |
} |
1498 |
return 0; |
1499 |
} |
1500 |
return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); |
1501 |
|
1502 |
} |
1503 |
|
1504 |
/*
|
1505 |
-------------------------------------------------------------------------------
|
1506 |
Returns 1 if the single-precision floating-point value `a' is less than or
|
1507 |
equal to the corresponding value `b', and 0 otherwise. The comparison is
|
1508 |
performed according to the IEC/IEEE Standard for Binary Floating-point
|
1509 |
Arithmetic.
|
1510 |
-------------------------------------------------------------------------------
|
1511 |
*/
|
1512 |
flag float32_le( float32 a, float32 b ) |
1513 |
{ |
1514 |
flag aSign, bSign; |
1515 |
|
1516 |
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) |
1517 |
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
|
1518 |
) { |
1519 |
float_raise( float_flag_invalid ); |
1520 |
return 0; |
1521 |
} |
1522 |
aSign = extractFloat32Sign( a ); |
1523 |
bSign = extractFloat32Sign( b ); |
1524 |
if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); |
1525 |
return ( a == b ) || ( aSign ^ ( a < b ) );
|
1526 |
|
1527 |
} |
1528 |
|
1529 |
/*
|
1530 |
-------------------------------------------------------------------------------
|
1531 |
Returns 1 if the single-precision floating-point value `a' is less than
|
1532 |
the corresponding value `b', and 0 otherwise. The comparison is performed
|
1533 |
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
|
1534 |
-------------------------------------------------------------------------------
|
1535 |
*/
|
1536 |
flag float32_lt( float32 a, float32 b ) |
1537 |
{ |
1538 |
flag aSign, bSign; |
1539 |
|
1540 |
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) |
1541 |
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
|
1542 |
) { |
1543 |
float_raise( float_flag_invalid ); |
1544 |
return 0; |
1545 |
} |
1546 |
aSign = extractFloat32Sign( a ); |
1547 |
bSign = extractFloat32Sign( b ); |
1548 |
if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); |
1549 |
return ( a != b ) && ( aSign ^ ( a < b ) );
|
1550 |
|
1551 |
} |
1552 |
|
1553 |
/*
|
1554 |
-------------------------------------------------------------------------------
|
1555 |
Returns 1 if the single-precision floating-point value `a' is equal to the
|
1556 |
corresponding value `b', and 0 otherwise. The invalid exception is raised
|
1557 |
if either operand is a NaN. Otherwise, the comparison is performed
|
1558 |
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
|
1559 |
-------------------------------------------------------------------------------
|
1560 |
*/
|
1561 |
flag float32_eq_signaling( float32 a, float32 b ) |
1562 |
{ |
1563 |
|
1564 |
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) |
1565 |
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
|
1566 |
) { |
1567 |
float_raise( float_flag_invalid ); |
1568 |
return 0; |
1569 |
} |
1570 |
return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); |
1571 |
|
1572 |
} |
1573 |
|
1574 |
/*
|
1575 |
-------------------------------------------------------------------------------
|
1576 |
Returns 1 if the single-precision floating-point value `a' is less than or
|
1577 |
equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
|
1578 |
cause an exception. Otherwise, the comparison is performed according to the
|
1579 |
IEC/IEEE Standard for Binary Floating-point Arithmetic.
|
1580 |
-------------------------------------------------------------------------------
|
1581 |
*/
|
1582 |
flag float32_le_quiet( float32 a, float32 b ) |
1583 |
{ |
1584 |
flag aSign, bSign; |
1585 |
//int16 aExp, bExp;
|
1586 |
|
1587 |
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) |
1588 |
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
|
1589 |
) { |
1590 |
if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
|
1591 |
float_raise( float_flag_invalid ); |
1592 |
} |
1593 |
return 0; |
1594 |
} |
1595 |
aSign = extractFloat32Sign( a ); |
1596 |
bSign = extractFloat32Sign( b ); |
1597 |
if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); |
1598 |
return ( a == b ) || ( aSign ^ ( a < b ) );
|
1599 |
|
1600 |
} |
1601 |
|
1602 |
/*
|
1603 |
-------------------------------------------------------------------------------
|
1604 |
Returns 1 if the single-precision floating-point value `a' is less than
|
1605 |
the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
|
1606 |
exception. Otherwise, the comparison is performed according to the IEC/IEEE
|
1607 |
Standard for Binary Floating-point Arithmetic.
|
1608 |
-------------------------------------------------------------------------------
|
1609 |
*/
|
1610 |
flag float32_lt_quiet( float32 a, float32 b ) |
1611 |
{ |
1612 |
flag aSign, bSign; |
1613 |
|
1614 |
if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) |
1615 |
|| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
|
1616 |
) { |
1617 |
if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
|
1618 |
float_raise( float_flag_invalid ); |
1619 |
} |
1620 |
return 0; |
1621 |
} |
1622 |
aSign = extractFloat32Sign( a ); |
1623 |
bSign = extractFloat32Sign( b ); |
1624 |
if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); |
1625 |
return ( a != b ) && ( aSign ^ ( a < b ) );
|
1626 |
|
1627 |
} |
1628 |
|
1629 |
/*
|
1630 |
-------------------------------------------------------------------------------
|
1631 |
Returns the result of converting the double-precision floating-point value
|
1632 |
`a' to the 32-bit two's complement integer format. The conversion is
|
1633 |
performed according to the IEC/IEEE Standard for Binary Floating-point
|
1634 |
Arithmetic---which means in particular that the conversion is rounded
|
1635 |
according to the current rounding mode. If `a' is a NaN, the largest
|
1636 |
positive integer is returned. Otherwise, if the conversion overflows, the
|
1637 |
largest integer with the same sign as `a' is returned.
|
1638 |
-------------------------------------------------------------------------------
|
1639 |
*/
|
1640 |
int32 float64_to_int32( float64 a ) |
1641 |
{ |
1642 |
flag aSign; |
1643 |
int16 aExp, shiftCount; |
1644 |
bits64 aSig; |
1645 |
|
1646 |
aSig = extractFloat64Frac( a ); |
1647 |
aExp = extractFloat64Exp( a ); |
1648 |
aSign = extractFloat64Sign( a ); |
1649 |
if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; |
1650 |
if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); |
1651 |
shiftCount = 0x42C - aExp;
|
1652 |
if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); |
1653 |
return roundAndPackInt32( aSign, aSig );
|
1654 |
|
1655 |
} |
1656 |
|
1657 |
/*
|
1658 |
-------------------------------------------------------------------------------
|
1659 |
Returns the result of converting the double-precision floating-point value
|
1660 |
`a' to the 32-bit two's complement integer format. The conversion is
|
1661 |
performed according to the IEC/IEEE Standard for Binary Floating-point
|
1662 |
Arithmetic, except that the conversion is always rounded toward zero. If
|
1663 |
`a' is a NaN, the largest positive integer is returned. Otherwise, if the
|
1664 |
conversion overflows, the largest integer with the same sign as `a' is
|
1665 |
returned.
|
1666 |
-------------------------------------------------------------------------------
|
1667 |
*/
|
1668 |
int32 float64_to_int32_round_to_zero( float64 a ) |
1669 |
{ |
1670 |
flag aSign; |
1671 |
int16 aExp, shiftCount; |
1672 |
bits64 aSig, savedASig; |
1673 |
int32 z; |
1674 |
|
1675 |
aSig = extractFloat64Frac( a ); |
1676 |
aExp = extractFloat64Exp( a ); |
1677 |
aSign = extractFloat64Sign( a ); |
1678 |
shiftCount = 0x433 - aExp;
|
1679 |
if ( shiftCount < 21 ) { |
1680 |
if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; |
1681 |
goto invalid;
|
1682 |
} |
1683 |
else if ( 52 < shiftCount ) { |
1684 |
if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
|
1685 |
return 0; |
1686 |
} |
1687 |
aSig |= LIT64( 0x0010000000000000 );
|
1688 |
savedASig = aSig; |
1689 |
aSig >>= shiftCount; |
1690 |
z = aSig; |
1691 |
if ( aSign ) z = - z;
|
1692 |
if ( ( z < 0 ) ^ aSign ) { |
1693 |
invalid:
|
1694 |
float_exception_flags |= float_flag_invalid; |
1695 |
return aSign ? 0x80000000 : 0x7FFFFFFF; |
1696 |
} |
1697 |
if ( ( aSig<<shiftCount ) != savedASig ) {
|
1698 |
float_exception_flags |= float_flag_inexact; |
1699 |
} |
1700 |
return z;
|
1701 |
|
1702 |
} |
1703 |
|
1704 |
/*
|
1705 |
-------------------------------------------------------------------------------
|
1706 |
Returns the result of converting the double-precision floating-point value
|
1707 |
`a' to the 32-bit two's complement unsigned integer format. The conversion
|
1708 |
is performed according to the IEC/IEEE Standard for Binary Floating-point
|
1709 |
Arithmetic---which means in particular that the conversion is rounded
|
1710 |
according to the current rounding mode. If `a' is a NaN, the largest
|
1711 |
positive integer is returned. Otherwise, if the conversion overflows, the
|
1712 |
largest positive integer is returned.
|
1713 |
-------------------------------------------------------------------------------
|
1714 |
*/
|
1715 |
int32 float64_to_uint32( float64 a ) |
1716 |
{ |
1717 |
flag aSign; |
1718 |
int16 aExp, shiftCount; |
1719 |
bits64 aSig; |
1720 |
|
1721 |
aSig = extractFloat64Frac( a ); |
1722 |
aExp = extractFloat64Exp( a ); |
1723 |
aSign = 0; //extractFloat64Sign( a ); |
1724 |
//if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
|
1725 |
if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); |
1726 |
shiftCount = 0x42C - aExp;
|
1727 |
if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); |
1728 |
return roundAndPackInt32( aSign, aSig );
|
1729 |
} |
1730 |
|
1731 |
/*
|
1732 |
-------------------------------------------------------------------------------
|
1733 |
Returns the result of converting the double-precision floating-point value
|
1734 |
`a' to the 32-bit two's complement integer format. The conversion is
|
1735 |
performed according to the IEC/IEEE Standard for Binary Floating-point
|
1736 |
Arithmetic, except that the conversion is always rounded toward zero. If
|
1737 |
`a' is a NaN, the largest positive integer is returned. Otherwise, if the
|
1738 |
conversion overflows, the largest positive integer is returned.
|
1739 |
-------------------------------------------------------------------------------
|
1740 |
*/
|
1741 |
int32 float64_to_uint32_round_to_zero( float64 a ) |
1742 |
{ |
1743 |
flag aSign; |
1744 |
int16 aExp, shiftCount; |
1745 |
bits64 aSig, savedASig; |
1746 |
int32 z; |
1747 |
|
1748 |
aSig = extractFloat64Frac( a ); |
1749 |
aExp = extractFloat64Exp( a ); |
1750 |
aSign = extractFloat64Sign( a ); |
1751 |
shiftCount = 0x433 - aExp;
|
1752 |
if ( shiftCount < 21 ) { |
1753 |
if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; |
1754 |
goto invalid;
|
1755 |
} |
1756 |
else if ( 52 < shiftCount ) { |
1757 |
if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
|
1758 |
return 0; |
1759 |
} |
1760 |
aSig |= LIT64( 0x0010000000000000 );
|
1761 |
savedASig = aSig; |
1762 |
aSig >>= shiftCount; |
1763 |
z = aSig; |
1764 |
if ( aSign ) z = - z;
|
1765 |
if ( ( z < 0 ) ^ aSign ) { |
1766 |
invalid:
|
1767 |
float_exception_flags |= float_flag_invalid; |
1768 |
return aSign ? 0x80000000 : 0x7FFFFFFF; |
1769 |
} |
1770 |
if ( ( aSig<<shiftCount ) != savedASig ) {
|
1771 |
float_exception_flags |= float_flag_inexact; |
1772 |
} |
1773 |
return z;
|
1774 |
} |
1775 |
|
1776 |
/*
|
1777 |
-------------------------------------------------------------------------------
|
1778 |
Returns the result of converting the double-precision floating-point value
|
1779 |
`a' to the single-precision floating-point format. The conversion is
|
1780 |
performed according to the IEC/IEEE Standard for Binary Floating-point
|
1781 |
Arithmetic.
|
1782 |
-------------------------------------------------------------------------------
|
1783 |
*/
|
1784 |
float32 float64_to_float32( float64 a ) |
1785 |
{ |
1786 |
flag aSign; |
1787 |
int16 aExp; |
1788 |
bits64 aSig; |
1789 |
bits32 zSig; |
1790 |
|
1791 |
aSig = extractFloat64Frac( a ); |
1792 |
aExp = extractFloat64Exp( a ); |
1793 |
aSign = extractFloat64Sign( a ); |
1794 |
if ( aExp == 0x7FF ) { |
1795 |
if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) ); |
1796 |
return packFloat32( aSign, 0xFF, 0 ); |
1797 |
} |
1798 |
shift64RightJamming( aSig, 22, &aSig );
|
1799 |
zSig = aSig; |
1800 |
if ( aExp || zSig ) {
|
1801 |
zSig |= 0x40000000;
|
1802 |
aExp -= 0x381;
|
1803 |
} |
1804 |
return roundAndPackFloat32( aSign, aExp, zSig );
|
1805 |
|
1806 |
} |
1807 |
|
1808 |
#ifdef FLOATX80
|
1809 |
|
1810 |
/*
|
1811 |
-------------------------------------------------------------------------------
|
1812 |
Returns the result of converting the double-precision floating-point value
|
1813 |
`a' to the extended double-precision floating-point format. The conversion
|
1814 |
is performed according to the IEC/IEEE Standard for Binary Floating-point
|
1815 |
Arithmetic.
|
1816 |
-------------------------------------------------------------------------------
|
1817 |
*/
|
1818 |
floatx80 float64_to_floatx80( float64 a ) |
1819 |
{ |
1820 |
flag aSign; |
1821 |
int16 aExp; |
1822 |
bits64 aSig; |
1823 |
|
1824 |
aSig = extractFloat64Frac( a ); |
1825 |
aExp = extractFloat64Exp( a ); |
1826 |
aSign = extractFloat64Sign( a ); |
1827 |
if ( aExp == 0x7FF ) { |
1828 |
if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) ); |
1829 |
return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); |
1830 |
} |
1831 |
if ( aExp == 0 ) { |
1832 |
if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); |
1833 |
normalizeFloat64Subnormal( aSig, &aExp, &aSig ); |
1834 |
} |
1835 |
return
|
1836 |
packFloatx80( |
1837 |
aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); |
1838 |
|
1839 |
} |
1840 |
|
1841 |
#endif
|
1842 |
|
1843 |
/*
|
1844 |
-------------------------------------------------------------------------------
|
1845 |
Rounds the double-precision floating-point value `a' to an integer, and
|
1846 |
returns the result as a double-precision floating-point value. The
|
1847 |
operation is performed according to the IEC/IEEE Standard for Binary
|
1848 |
Floating-point Arithmetic.
|
1849 |
-------------------------------------------------------------------------------
|
1850 |
*/
|
1851 |
float64 float64_round_to_int( float64 a ) |
1852 |
{ |
1853 |
flag aSign; |
1854 |
int16 aExp; |
1855 |
bits64 lastBitMask, roundBitsMask; |
1856 |
int8 roundingMode; |
1857 |
float64 z; |
1858 |
|
1859 |
aExp = extractFloat64Exp( a ); |
1860 |
if ( 0x433 <= aExp ) { |
1861 |
if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { |
1862 |
return propagateFloat64NaN( a, a );
|
1863 |
} |
1864 |
return a;
|
1865 |
} |
1866 |
if ( aExp <= 0x3FE ) { |
1867 |
if ( (bits64) ( a<<1 ) == 0 ) return a; |
1868 |
float_exception_flags |= float_flag_inexact; |
1869 |
aSign = extractFloat64Sign( a ); |
1870 |
switch ( float_rounding_mode ) {
|
1871 |
case float_round_nearest_even:
|
1872 |
if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { |
1873 |
return packFloat64( aSign, 0x3FF, 0 ); |
1874 |
} |
1875 |
break;
|
1876 |
case float_round_down:
|
1877 |
return aSign ? LIT64( 0xBFF0000000000000 ) : 0; |
1878 |
case float_round_up:
|
1879 |
return
|
1880 |
aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ); |
1881 |
} |
1882 |
return packFloat64( aSign, 0, 0 ); |
1883 |
} |
1884 |
lastBitMask = 1;
|
1885 |
lastBitMask <<= 0x433 - aExp;
|
1886 |
roundBitsMask = lastBitMask - 1;
|
1887 |
z = a; |
1888 |
roundingMode = float_rounding_mode; |
1889 |
if ( roundingMode == float_round_nearest_even ) {
|
1890 |
z += lastBitMask>>1;
|
1891 |
if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; |
1892 |
} |
1893 |
else if ( roundingMode != float_round_to_zero ) { |
1894 |
if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
|
1895 |
z += roundBitsMask; |
1896 |
} |
1897 |
} |
1898 |
z &= ~ roundBitsMask; |
1899 |
if ( z != a ) float_exception_flags |= float_flag_inexact;
|
1900 |
return z;
|
1901 |
|
1902 |
} |
1903 |
|
1904 |
/*
|
1905 |
-------------------------------------------------------------------------------
|
1906 |
Returns the result of adding the absolute values of the double-precision
|
1907 |
floating-point values `a' and `b'. If `zSign' is true, the sum is negated
|
1908 |
before being returned. `zSign' is ignored if the result is a NaN. The
|
1909 |
addition is performed according to the IEC/IEEE Standard for Binary
|
1910 |
Floating-point Arithmetic.
|
1911 |
-------------------------------------------------------------------------------
|
1912 |
*/
|
1913 |
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
|
1914 |
{ |
1915 |
int16 aExp, bExp, zExp; |
1916 |
bits64 aSig, bSig, zSig; |
1917 |
int16 expDiff; |
1918 |
|
1919 |
aSig = extractFloat64Frac( a ); |
1920 |
aExp = extractFloat64Exp( a ); |
1921 |
bSig = extractFloat64Frac( b ); |
1922 |
bExp = extractFloat64Exp( b ); |
1923 |
expDiff = aExp - bExp; |
1924 |
aSig <<= 9;
|
1925 |
bSig <<= 9;
|
1926 |
if ( 0 < expDiff ) { |
1927 |
if ( aExp == 0x7FF ) { |
1928 |
if ( aSig ) return propagateFloat64NaN( a, b ); |
1929 |
return a;
|
1930 |
} |
1931 |
if ( bExp == 0 ) { |
1932 |
--expDiff; |
1933 |
} |
1934 |
else {
|
1935 |
bSig |= LIT64( 0x2000000000000000 );
|
1936 |
} |
1937 |
shift64RightJamming( bSig, expDiff, &bSig ); |
1938 |
zExp = aExp; |
1939 |
} |
1940 |
else if ( expDiff < 0 ) { |
1941 |
if ( bExp == 0x7FF ) { |
1942 |
if ( bSig ) return propagateFloat64NaN( a, b ); |
1943 |
return packFloat64( zSign, 0x7FF, 0 ); |
1944 |
} |
1945 |
if ( aExp == 0 ) { |
1946 |
++expDiff; |
1947 |
} |
1948 |
else {
|
1949 |
aSig |= LIT64( 0x2000000000000000 );
|
1950 |
} |
1951 |
shift64RightJamming( aSig, - expDiff, &aSig ); |
1952 |
zExp = bExp; |
1953 |
} |
1954 |
else {
|
1955 |
if ( aExp == 0x7FF ) { |
1956 |
if ( aSig | bSig ) return propagateFloat64NaN( a, b ); |
1957 |
return a;
|
1958 |
} |
1959 |
if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); |
1960 |
zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
|
1961 |
zExp = aExp; |
1962 |
goto roundAndPack;
|
1963 |
} |
1964 |
aSig |= LIT64( 0x2000000000000000 );
|
1965 |
zSig = ( aSig + bSig )<<1;
|
1966 |
--zExp; |
1967 |
if ( (sbits64) zSig < 0 ) { |
1968 |
zSig = aSig + bSig; |
1969 |
++zExp; |
1970 |
} |
1971 |
roundAndPack:
|
1972 |
return roundAndPackFloat64( zSign, zExp, zSig );
|
1973 |
|
1974 |
} |
1975 |
|
1976 |
/*
|
1977 |
-------------------------------------------------------------------------------
|
1978 |
Returns the result of subtracting the absolute values of the double-
|
1979 |
precision floating-point values `a' and `b'. If `zSign' is true, the
|
1980 |
difference is negated before being returned. `zSign' is ignored if the
|
1981 |
result is a NaN. The subtraction is performed according to the IEC/IEEE
|
1982 |
Standard for Binary Floating-point Arithmetic.
|
1983 |
-------------------------------------------------------------------------------
|
1984 |
*/
|
1985 |
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
|
1986 |
{ |
1987 |
int16 aExp, bExp, zExp; |
1988 |
bits64 aSig, bSig, zSig; |
1989 |
int16 expDiff; |
1990 |
|
1991 |
aSig = extractFloat64Frac( a ); |
1992 |
aExp = extractFloat64Exp( a ); |
1993 |
bSig = extractFloat64Frac( b ); |
1994 |
bExp = extractFloat64Exp( b ); |
1995 |
expDiff = aExp - bExp; |
1996 |
aSig <<= 10;
|
1997 |
bSig <<= 10;
|
1998 |
if ( 0 < expDiff ) goto aExpBigger; |
1999 |
if ( expDiff < 0 ) goto bExpBigger; |
2000 |
if ( aExp == 0x7FF ) { |
2001 |
if ( aSig | bSig ) return propagateFloat64NaN( a, b ); |
2002 |
float_raise( float_flag_invalid ); |
2003 |
return float64_default_nan;
|
2004 |
} |
2005 |
if ( aExp == 0 ) { |
2006 |
aExp = 1;
|
2007 |
bExp = 1;
|
2008 |
} |
2009 |
if ( bSig < aSig ) goto aBigger; |
2010 |
if ( aSig < bSig ) goto bBigger; |
2011 |
return packFloat64( float_rounding_mode == float_round_down, 0, 0 ); |
2012 |
bExpBigger:
|
2013 |
if ( bExp == 0x7FF ) { |
2014 |
if ( bSig ) return propagateFloat64NaN( a, b ); |
2015 |
return packFloat64( zSign ^ 1, 0x7FF, 0 ); |
2016 |
} |
2017 |
if ( aExp == 0 ) { |
2018 |
++expDiff; |
2019 |
} |
2020 |
else {
|
2021 |
aSig |= LIT64( 0x4000000000000000 );
|
2022 |
} |
2023 |
shift64RightJamming( aSig, - expDiff, &aSig ); |
2024 |
bSig |= LIT64( 0x4000000000000000 );
|
2025 |
bBigger:
|
2026 |
zSig = bSig - aSig; |
2027 |
zExp = bExp; |
2028 |
zSign ^= 1;
|
2029 |
goto normalizeRoundAndPack;
|
2030 |
aExpBigger:
|
2031 |
if ( aExp == 0x7FF ) { |
2032 |
if ( aSig ) return propagateFloat64NaN( a, b ); |
2033 |
return a;
|
2034 |
} |
2035 |
if ( bExp == 0 ) { |
2036 |
--expDiff; |
2037 |
} |
2038 |
else {
|
2039 |
bSig |= LIT64( 0x4000000000000000 );
|
2040 |
} |
2041 |
shift64RightJamming( bSig, expDiff, &bSig ); |
2042 |
aSig |= LIT64( 0x4000000000000000 );
|
2043 |
aBigger:
|
2044 |
zSig = aSig - bSig; |
2045 |
zExp = aExp; |
2046 |
normalizeRoundAndPack:
|
2047 |
--zExp; |
2048 |
return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
|
2049 |
|
2050 |
} |
2051 |
|
2052 |
/*
|
2053 |
-------------------------------------------------------------------------------
|
2054 |
Returns the result of adding the double-precision floating-point values `a'
|
2055 |
and `b'. The operation is performed according to the IEC/IEEE Standard for
|
2056 |
Binary Floating-point Arithmetic.
|
2057 |
-------------------------------------------------------------------------------
|
2058 |
*/
|
2059 |
float64 float64_add( float64 a, float64 b ) |
2060 |
{ |
2061 |
flag aSign, bSign; |
2062 |
|
2063 |
aSign = extractFloat64Sign( a ); |
2064 |
bSign = extractFloat64Sign( b ); |
2065 |
if ( aSign == bSign ) {
|
2066 |
return addFloat64Sigs( a, b, aSign );
|
2067 |
} |
2068 |
else {
|
2069 |
return subFloat64Sigs( a, b, aSign );
|
2070 |
} |
2071 |
|
2072 |
} |
2073 |
|
2074 |
/*
|
2075 |
-------------------------------------------------------------------------------
|
2076 |
Returns the result of subtracting the double-precision floating-point values
|
2077 |
`a' and `b'. The operation is performed according to the IEC/IEEE Standard
|
2078 |
for Binary Floating-point Arithmetic.
|
2079 |
-------------------------------------------------------------------------------
|
2080 |
*/
|
2081 |
float64 float64_sub( float64 a, float64 b ) |
2082 |
{ |
2083 |
flag aSign, bSign; |
2084 |
|
2085 |
aSign = extractFloat64Sign( a ); |
2086 |
bSign = extractFloat64Sign( b ); |
2087 |
if ( aSign == bSign ) {
|
2088 |
return subFloat64Sigs( a, b, aSign );
|
2089 |
} |
2090 |
else {
|
2091 |
return addFloat64Sigs( a, b, aSign );
|
2092 |
} |
2093 |
|
2094 |
} |
2095 |
|
2096 |
/*
|
2097 |
-------------------------------------------------------------------------------
|
2098 |
Returns the result of multiplying the double-precision floating-point values
|
2099 |
`a' and `b'. The operation is performed according to the IEC/IEEE Standard
|
2100 |
for Binary Floating-point Arithmetic.
|
2101 |
-------------------------------------------------------------------------------
|
2102 |
*/
|
2103 |
float64 float64_mul( float64 a, float64 b ) |
2104 |
{ |
2105 |
flag aSign, bSign, zSign; |
2106 |
int16 aExp, bExp, zExp; |
2107 |
bits64 aSig, bSig, zSig0, zSig1; |
2108 |
|
2109 |
aSig = extractFloat64Frac( a ); |
2110 |
aExp = extractFloat64Exp( a ); |
2111 |
aSign = extractFloat64Sign( a ); |
2112 |
bSig = extractFloat64Frac( b ); |
2113 |
bExp = extractFloat64Exp( b ); |
2114 |
bSign = extractFloat64Sign( b ); |
2115 |
zSign = aSign ^ bSign; |
2116 |
if ( aExp == 0x7FF ) { |
2117 |
if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { |
2118 |
return propagateFloat64NaN( a, b );
|
2119 |
} |
2120 |
if ( ( bExp | bSig ) == 0 ) { |
2121 |
float_raise( float_flag_invalid ); |
2122 |
return float64_default_nan;
|
2123 |
} |
2124 |
return packFloat64( zSign, 0x7FF, 0 ); |
2125 |
} |
2126 |
if ( bExp == 0x7FF ) { |
2127 |
if ( bSig ) return propagateFloat64NaN( a, b ); |
2128 |
if ( ( aExp | aSig ) == 0 ) { |
2129 |
float_raise( float_flag_invalid ); |
2130 |
return float64_default_nan;
|
2131 |
} |
2132 |
return packFloat64( zSign, 0x7FF, 0 ); |
2133 |
} |
2134 |
if ( aExp == 0 ) { |
2135 |
if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); |
2136 |
normalizeFloat64Subnormal( aSig, &aExp, &aSig ); |
2137 |
} |
2138 |
if ( bExp == 0 ) { |
2139 |
if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); |
2140 |
normalizeFloat64Subnormal( bSig, &bExp, &bSig ); |
2141 |
} |
2142 |
zExp = aExp + bExp - 0x3FF;
|
2143 |
aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; |
2144 |
bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; |
2145 |
mul64To128( aSig, bSig, &zSig0, &zSig1 ); |
2146 |
zSig0 |= ( zSig1 != 0 );
|
2147 |
if ( 0 <= (sbits64) ( zSig0<<1 ) ) { |
2148 |
zSig0 <<= 1;
|
2149 |
--zExp; |
2150 |
} |
2151 |
return roundAndPackFloat64( zSign, zExp, zSig0 );
|
2152 |
|
2153 |
} |
2154 |
|
2155 |
/*
|
2156 |
-------------------------------------------------------------------------------
|
2157 |
Returns the result of dividing the double-precision floating-point value `a'
|
2158 |
by the corresponding value `b'. The operation is performed according to
|
2159 |
the IEC/IEEE Standard for Binary Floating-point Arithmetic.
|
2160 |
-------------------------------------------------------------------------------
|
2161 |
*/
|
2162 |
float64 float64_div( float64 a, float64 b ) |
2163 |
{ |
2164 |
flag aSign, bSign, zSign; |
2165 |
int16 aExp, bExp, zExp; |
2166 |
bits64 aSig, bSig, zSig; |
2167 |
bits64 rem0, rem1; |
2168 |
bits64 term0, term1; |
2169 |
|
2170 |
aSig = extractFloat64Frac( a ); |
2171 |
aExp = extractFloat64Exp( a ); |
2172 |
aSign = extractFloat64Sign( a ); |
2173 |
bSig = extractFloat64Frac( b ); |
2174 |
bExp = extractFloat64Exp( b ); |
2175 |
bSign = extractFloat64Sign( b ); |
2176 |
zSign = aSign ^ bSign; |
2177 |
if ( aExp == 0x7FF ) { |
2178 |
if ( aSig ) return propagateFloat64NaN( a, b ); |
2179 |
if ( bExp == 0x7FF ) { |
2180 |
if ( bSig ) return propagateFloat64NaN( a, b ); |
2181 |
float_raise( float_flag_invalid ); |
2182 |
return float64_default_nan;
|
2183 |
} |
2184 |
return packFloat64( zSign, 0x7FF, 0 ); |
2185 |
} |
2186 |
if ( bExp == 0x7FF ) { |
2187 |
if ( bSig ) return propagateFloat64NaN( a, b ); |
2188 |
return packFloat64( zSign, 0, 0 ); |
2189 |
} |
2190 |
if ( bExp == 0 ) { |
2191 |
if ( bSig == 0 ) { |
2192 |
if ( ( aExp | aSig ) == 0 ) { |
2193 |
float_raise( float_flag_invalid ); |
2194 |
return float64_default_nan;
|
2195 |
} |
2196 |
float_raise( float_flag_divbyzero ); |
2197 |
return packFloat64( zSign, 0x7FF, 0 ); |
2198 |
} |
2199 |
normalizeFloat64Subnormal( bSig, &bExp, &bSig ); |
2200 |
} |
2201 |
if ( aExp == 0 ) { |
2202 |
if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); |
2203 |
normalizeFloat64Subnormal( aSig, &aExp, &aSig ); |
2204 |
} |
2205 |
zExp = aExp - bExp + 0x3FD;
|
2206 |
aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; |
2207 |
bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; |
2208 |
if ( bSig <= ( aSig + aSig ) ) {
|
2209 |
aSig >>= 1;
|
2210 |
++zExp; |
2211 |
} |
2212 |
zSig = estimateDiv128To64( aSig, 0, bSig );
|
2213 |
if ( ( zSig & 0x1FF ) <= 2 ) { |
2214 |
mul64To128( bSig, zSig, &term0, &term1 ); |
2215 |
sub128( aSig, 0, term0, term1, &rem0, &rem1 );
|
2216 |
while ( (sbits64) rem0 < 0 ) { |
2217 |
--zSig; |
2218 |
add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
|
2219 |
} |
2220 |
zSig |= ( rem1 != 0 );
|
2221 |
} |
2222 |
return roundAndPackFloat64( zSign, zExp, zSig );
|
2223 |
|
2224 |
} |
2225 |
|
2226 |
/*
|
2227 |
-------------------------------------------------------------------------------
|
2228 |
Returns the remainder of the double-precision floating-point value `a'
|
2229 |
with respect to the corresponding value `b'. The operation is performed
|
2230 |
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
|
2231 |
-------------------------------------------------------------------------------
|
2232 |
*/
|
2233 |
float64 float64_rem( float64 a, float64 b ) |
2234 |
{ |
2235 |
flag aSign, bSign, zSign; |
2236 |
int16 aExp, bExp, expDiff; |
2237 |
bits64 aSig, bSig; |
2238 |
bits64 q, alternateASig; |
2239 |
sbits64 sigMean; |
2240 |
|
2241 |
aSig = extractFloat64Frac( a ); |
2242 |
aExp = extractFloat64Exp( a ); |
2243 |
aSign = extractFloat64Sign( a ); |
2244 |
bSig = extractFloat64Frac( b ); |
2245 |
bExp = extractFloat64Exp( b ); |
2246 |
bSign = extractFloat64Sign( b ); |
2247 |
if ( aExp == 0x7FF ) { |
2248 |
if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { |
2249 |
return propagateFloat64NaN( a, b );
|
2250 |
} |
2251 |
float_raise( float_flag_invalid ); |
2252 |
return float64_default_nan;
|
2253 |
} |
2254 |
if ( bExp == 0x7FF ) { |
2255 |
if ( bSig ) return propagateFloat64NaN( a, b ); |
2256 |
return a;
|
2257 |
} |
2258 |
if ( bExp == 0 ) { |
2259 |
if ( bSig == 0 ) { |
2260 |
float_raise( float_flag_invalid ); |
2261 |
return float64_default_nan;
|
2262 |
} |
2263 |
normalizeFloat64Subnormal( bSig, &bExp, &bSig ); |
2264 |
} |
2265 |
if ( aExp == 0 ) { |
2266 |
if ( aSig == 0 ) return a; |
2267 |
normalizeFloat64Subnormal( aSig, &aExp, &aSig ); |
2268 |
} |
2269 |
expDiff = aExp - bExp; |
2270 |
aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; |
2271 |
bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; |
2272 |
if ( expDiff < 0 ) { |
2273 |
if ( expDiff < -1 ) return a; |
2274 |
aSig >>= 1;
|
2275 |
} |
2276 |
q = ( bSig <= aSig ); |
2277 |
if ( q ) aSig -= bSig;
|
2278 |
expDiff -= 64;
|
2279 |
while ( 0 < expDiff ) { |
2280 |
q = estimateDiv128To64( aSig, 0, bSig );
|
2281 |
q = ( 2 < q ) ? q - 2 : 0; |
2282 |
aSig = - ( ( bSig>>2 ) * q );
|
2283 |
expDiff -= 62;
|
2284 |
} |
2285 |
expDiff += 64;
|
2286 |
if ( 0 < expDiff ) { |
2287 |
q = estimateDiv128To64( aSig, 0, bSig );
|
2288 |
q = ( 2 < q ) ? q - 2 : 0; |
2289 |
q >>= 64 - expDiff;
|
2290 |
bSig >>= 2;
|
2291 |
aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; |
2292 |
} |
2293 |
else {
|
2294 |
aSig >>= 2;
|
2295 |
bSig >>= 2;
|
2296 |
} |
2297 |
do {
|
2298 |
alternateASig = aSig; |
2299 |
++q; |
2300 |
aSig -= bSig; |
2301 |
} while ( 0 <= (sbits64) aSig ); |
2302 |
sigMean = aSig + alternateASig; |
2303 |
if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { |
2304 |
aSig = alternateASig; |
2305 |
} |
2306 |
zSign = ( (sbits64) aSig < 0 );
|
2307 |
if ( zSign ) aSig = - aSig;
|
2308 |
return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
|
2309 |
|
2310 |
} |
2311 |
|
2312 |
/*
|
2313 |
-------------------------------------------------------------------------------
|
2314 |
Returns the square root of the double-precision floating-point value `a'.
|
2315 |
The operation is performed according to the IEC/IEEE Standard for Binary
|
2316 |
Floating-point Arithmetic.
|
2317 |
-------------------------------------------------------------------------------
|
2318 |
*/
|
2319 |
float64 float64_sqrt( float64 a ) |
2320 |
{ |
2321 |
flag aSign; |
2322 |
int16 aExp, zExp; |
2323 |
bits64 aSig, zSig; |
2324 |
bits64 rem0, rem1, term0, term1; //, shiftedRem;
|
2325 |
//float64 z;
|
2326 |
|
2327 |
aSig = extractFloat64Frac( a ); |
2328 |
aExp = extractFloat64Exp( a ); |
2329 |
aSign = extractFloat64Sign( a ); |
2330 |
if ( aExp == 0x7FF ) { |
2331 |
if ( aSig ) return propagateFloat64NaN( a, a ); |
2332 |
if ( ! aSign ) return a; |
2333 |
float_raise( float_flag_invalid ); |
2334 |
return float64_default_nan;
|
2335 |
} |
2336 |
if ( aSign ) {
|
2337 |
if ( ( aExp | aSig ) == 0 ) return a; |
2338 |
float_raise( float_flag_invalid ); |
2339 |
return float64_default_nan;
|
2340 |
} |
2341 |
if ( aExp == 0 ) { |
2342 |
if ( aSig == 0 ) return 0; |
2343 |
normalizeFloat64Subnormal( aSig, &aExp, &aSig ); |
2344 |
} |
2345 |
zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; |
2346 |
aSig |= LIT64( 0x0010000000000000 );
|
2347 |
zSig = estimateSqrt32( aExp, aSig>>21 );
|
2348 |
zSig <<= 31;
|
2349 |
aSig <<= 9 - ( aExp & 1 ); |
2350 |
zSig = estimateDiv128To64( aSig, 0, zSig ) + zSig + 2; |
2351 |
if ( ( zSig & 0x3FF ) <= 5 ) { |
2352 |
if ( zSig < 2 ) { |
2353 |
zSig = LIT64( 0xFFFFFFFFFFFFFFFF );
|
2354 |
} |
2355 |
else {
|
2356 |
aSig <<= 2;
|
2357 |
mul64To128( zSig, zSig, &term0, &term1 ); |
2358 |
sub128( aSig, 0, term0, term1, &rem0, &rem1 );
|
2359 |
while ( (sbits64) rem0 < 0 ) { |
2360 |
--zSig; |
2361 |
shortShift128Left( 0, zSig, 1, &term0, &term1 ); |
2362 |
term1 |= 1;
|
2363 |
add128( rem0, rem1, term0, term1, &rem0, &rem1 ); |
2364 |
} |
2365 |
zSig |= ( ( rem0 | rem1 ) != 0 );
|
2366 |
} |
2367 |
} |
2368 |
shift64RightJamming( zSig, 1, &zSig );
|
2369 |
return roundAndPackFloat64( 0, zExp, zSig ); |
2370 |
|
2371 |
} |
2372 |
|
2373 |
/*
|
2374 |
-------------------------------------------------------------------------------
|
2375 |
Returns 1 if the double-precision floating-point value `a' is equal to the
|
2376 |
corresponding value `b', and 0 otherwise. The comparison is performed
|
2377 |
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
|
2378 |
-------------------------------------------------------------------------------
|
2379 |
*/
|
2380 |
flag float64_eq( float64 a, float64 b ) |
2381 |
{ |
2382 |
|
2383 |
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) |
2384 |
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
|
2385 |
) { |
2386 |
if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
|
2387 |
float_raise( float_flag_invalid ); |
2388 |
} |
2389 |
return 0; |
2390 |
} |
2391 |
return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 ); |
2392 |
|
2393 |
} |
2394 |
|
2395 |
/*
|
2396 |
-------------------------------------------------------------------------------
|
2397 |
Returns 1 if the double-precision floating-point value `a' is less than or
|
2398 |
equal to the corresponding value `b', and 0 otherwise. The comparison is
|
2399 |
performed according to the IEC/IEEE Standard for Binary Floating-point
|
2400 |
Arithmetic.
|
2401 |
-------------------------------------------------------------------------------
|
2402 |
*/
|
2403 |
flag float64_le( float64 a, float64 b ) |
2404 |
{ |
2405 |
flag aSign, bSign; |
2406 |
|
2407 |
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) |
2408 |
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
|
2409 |
) { |
2410 |
float_raise( float_flag_invalid ); |
2411 |
return 0; |
2412 |
} |
2413 |
aSign = extractFloat64Sign( a ); |
2414 |
bSign = extractFloat64Sign( b ); |
2415 |
if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 ); |
2416 |
return ( a == b ) || ( aSign ^ ( a < b ) );
|
2417 |
|
2418 |
} |
2419 |
|
2420 |
/*
|
2421 |
-------------------------------------------------------------------------------
|
2422 |
Returns 1 if the double-precision floating-point value `a' is less than
|
2423 |
the corresponding value `b', and 0 otherwise. The comparison is performed
|
2424 |
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
|
2425 |
-------------------------------------------------------------------------------
|
2426 |
*/
|
2427 |
flag float64_lt( float64 a, float64 b ) |
2428 |
{ |
2429 |
flag aSign, bSign; |
2430 |
|
2431 |
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) |
2432 |
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
|
2433 |
) { |
2434 |
float_raise( float_flag_invalid ); |
2435 |
return 0; |
2436 |
} |
2437 |
aSign = extractFloat64Sign( a ); |
2438 |
bSign = extractFloat64Sign( b ); |
2439 |
if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 ); |
2440 |
return ( a != b ) && ( aSign ^ ( a < b ) );
|
2441 |
|
2442 |
} |
2443 |
|
2444 |
/*
|
2445 |
-------------------------------------------------------------------------------
|
2446 |
Returns 1 if the double-precision floating-point value `a' is equal to the
|
2447 |
corresponding value `b', and 0 otherwise. The invalid exception is raised
|
2448 |
if either operand is a NaN. Otherwise, the comparison is performed
|
2449 |
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
|
2450 |
-------------------------------------------------------------------------------
|
2451 |
*/
|
2452 |
flag float64_eq_signaling( float64 a, float64 b ) |
2453 |
{ |
2454 |
|
2455 |
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) |
2456 |
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
|
2457 |
) { |
2458 |
float_raise( float_flag_invalid ); |
2459 |
return 0; |
2460 |
} |
2461 |
return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 ); |
2462 |
|
2463 |
} |
2464 |
|
2465 |
/*
|
2466 |
-------------------------------------------------------------------------------
|
2467 |
Returns 1 if the double-precision floating-point value `a' is less than or
|
2468 |
equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
|
2469 |
cause an exception. Otherwise, the comparison is performed according to the
|
2470 |
IEC/IEEE Standard for Binary Floating-point Arithmetic.
|
2471 |
-------------------------------------------------------------------------------
|
2472 |
*/
|
2473 |
flag float64_le_quiet( float64 a, float64 b ) |
2474 |
{ |
2475 |
flag aSign, bSign; |
2476 |
//int16 aExp, bExp;
|
2477 |
|
2478 |
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) |
2479 |
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
|
2480 |
) { |
2481 |
if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
|
2482 |
float_raise( float_flag_invalid ); |
2483 |
} |
2484 |
return 0; |
2485 |
} |
2486 |
aSign = extractFloat64Sign( a ); |
2487 |
bSign = extractFloat64Sign( b ); |
2488 |
if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 ); |
2489 |
return ( a == b ) || ( aSign ^ ( a < b ) );
|
2490 |
|
2491 |
} |
2492 |
|
2493 |
/*
|
2494 |
-------------------------------------------------------------------------------
|
2495 |
Returns 1 if the double-precision floating-point value `a' is less than
|
2496 |
the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
|
2497 |
exception. Otherwise, the comparison is performed according to the IEC/IEEE
|
2498 |
Standard for Binary Floating-point Arithmetic.
|
2499 |
-------------------------------------------------------------------------------
|
2500 |
*/
|
2501 |
flag float64_lt_quiet( float64 a, float64 b ) |
2502 |
{ |
2503 |
flag aSign, bSign; |
2504 |
|
2505 |
if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) |
2506 |
|| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
|
2507 |
) { |
2508 |
if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
|
2509 |
float_raise( float_flag_invalid ); |
2510 |
} |
2511 |
return 0; |
2512 |
} |
2513 |
aSign = extractFloat64Sign( a ); |
2514 |
bSign = extractFloat64Sign( b ); |
2515 |
if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 ); |
2516 |
return ( a != b ) && ( aSign ^ ( a < b ) );
|
2517 |
|
2518 |
} |
2519 |
|
2520 |
#ifdef FLOATX80
|
2521 |
|
2522 |
/*
|
2523 |
-------------------------------------------------------------------------------
|
2524 |
Returns the result of converting the extended double-precision floating-
|
2525 |
point value `a' to the 32-bit two's complement integer format. The
|
2526 |
conversion is performed according to the IEC/IEEE Standard for Binary
|
2527 |
Floating-point Arithmetic---which means in particular that the conversion
|
2528 |
is rounded according to the current rounding mode. If `a' is a NaN, the
|
2529 |
largest positive integer is returned. Otherwise, if the conversion
|
2530 |
overflows, the largest integer with the same sign as `a' is returned.
|
2531 |
-------------------------------------------------------------------------------
|
2532 |
*/
|
2533 |
int32 floatx80_to_int32( floatx80 a ) |
2534 |
{ |
2535 |
flag aSign; |
2536 |
int32 aExp, shiftCount; |
2537 |
bits64 aSig; |
2538 |
|
2539 |
aSig = extractFloatx80Frac( a ); |
2540 |
aExp = extractFloatx80Exp( a ); |
2541 |
aSign = extractFloatx80Sign( a ); |
2542 |
if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; |
2543 |
shiftCount = 0x4037 - aExp;
|
2544 |
if ( shiftCount <= 0 ) shiftCount = 1; |
2545 |
shift64RightJamming( aSig, shiftCount, &aSig ); |
2546 |
return roundAndPackInt32( aSign, aSig );
|
2547 |
|
2548 |
} |
2549 |
|
2550 |
/*
|
2551 |
-------------------------------------------------------------------------------
|
2552 |
Returns the result of converting the extended double-precision floating-
|
2553 |
point value `a' to the 32-bit two's complement integer format. The
|
2554 |
conversion is performed according to the IEC/IEEE Standard for Binary
|
2555 |
Floating-point Arithmetic, except that the conversion is always rounded
|
2556 |
toward zero. If `a' is a NaN, the largest positive integer is returned.
|
2557 |
Otherwise, if the conversion overflows, the largest integer with the same
|
2558 |
sign as `a' is returned.
|
2559 |
-------------------------------------------------------------------------------
|
2560 |
*/
|
2561 |
int32 floatx80_to_int32_round_to_zero( floatx80 a ) |
2562 |
{ |
2563 |
flag aSign; |
2564 |
int32 aExp, shiftCount; |
2565 |
bits64 aSig, savedASig; |
2566 |
int32 z; |
2567 |
|
2568 |
aSig = extractFloatx80Frac( a ); |
2569 |
aExp = extractFloatx80Exp( a ); |
2570 |
aSign = extractFloatx80Sign( a ); |
2571 |
shiftCount = 0x403E - aExp;
|
2572 |
if ( shiftCount < 32 ) { |
2573 |
if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; |
2574 |
goto invalid;
|
2575 |
} |
2576 |
else if ( 63 < shiftCount ) { |
2577 |
if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
|
2578 |
return 0; |
2579 |
} |
2580 |
savedASig = aSig; |
2581 |
aSig >>= shiftCount; |
2582 |
z = aSig; |
2583 |
if ( aSign ) z = - z;
|
2584 |
if ( ( z < 0 ) ^ aSign ) { |
2585 |
invalid:
|
2586 |
float_exception_flags |= float_flag_invalid; |
2587 |
return aSign ? 0x80000000 : 0x7FFFFFFF; |
2588 |
} |
2589 |
if ( ( aSig<<shiftCount ) != savedASig ) {
|
2590 |
float_exception_flags |= float_flag_inexact; |
2591 |
} |
2592 |
return z;
|
2593 |
|
2594 |
} |
2595 |
|
2596 |
/*
|
2597 |
-------------------------------------------------------------------------------
|
2598 |
Returns the result of converting the extended double-precision floating-
|
2599 |
point value `a' to the single-precision floating-point format. The
|
2600 |
conversion is performed according to the IEC/IEEE Standard for Binary
|
2601 |
Floating-point Arithmetic.
|
2602 |
-------------------------------------------------------------------------------
|
2603 |
*/
|
2604 |
float32 floatx80_to_float32( floatx80 a ) |
2605 |
{ |
2606 |
flag aSign; |
2607 |
int32 aExp; |
2608 |
bits64 aSig; |
2609 |
|
2610 |
aSig = extractFloatx80Frac( a ); |
2611 |
aExp = extractFloatx80Exp( a ); |
2612 |
aSign = extractFloatx80Sign( a ); |
2613 |
if ( aExp == 0x7FFF ) { |
2614 |
if ( (bits64) ( aSig<<1 ) ) { |
2615 |
return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
|
2616 |
} |
2617 |
return packFloat32( aSign, 0xFF, 0 ); |
2618 |
} |
2619 |
shift64RightJamming( aSig, 33, &aSig );
|
2620 |
if ( aExp || aSig ) aExp -= 0x3F81; |
2621 |
return roundAndPackFloat32( aSign, aExp, aSig );
|
2622 |
|
2623 |
} |
2624 |
|
2625 |
/*
|
2626 |
-------------------------------------------------------------------------------
|
2627 |
Returns the result of converting the extended double-precision floating-
|
2628 |
point value `a' to the double-precision floating-point format. The
|
2629 |
conversion is performed according to the IEC/IEEE Standard for Binary
|
2630 |
Floating-point Arithmetic.
|
2631 |
-------------------------------------------------------------------------------
|
2632 |
*/
|
2633 |
float64 floatx80_to_float64( floatx80 a ) |
2634 |
{ |
2635 |
flag aSign; |
2636 |
int32 aExp; |
2637 |
bits64 aSig, zSig; |
2638 |
|
2639 |
aSig = extractFloatx80Frac( a ); |
2640 |
aExp = extractFloatx80Exp( a ); |
2641 |
aSign = extractFloatx80Sign( a ); |
2642 |
if ( aExp == 0x7FFF ) { |
2643 |
if ( (bits64) ( aSig<<1 ) ) { |
2644 |
return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
|
2645 |
} |
2646 |
return packFloat64( aSign, 0x7FF, 0 ); |
2647 |
} |
2648 |
shift64RightJamming( aSig, 1, &zSig );
|
2649 |
if ( aExp || aSig ) aExp -= 0x3C01; |
2650 |
return roundAndPackFloat64( aSign, aExp, zSig );
|
2651 |
|
2652 |
} |
2653 |
|
2654 |
/*
|
2655 |
-------------------------------------------------------------------------------
|
2656 |
Rounds the extended double-precision floating-point value `a' to an integer,
|
2657 |
and returns the result as an extended quadruple-precision floating-point
|
2658 |
value. The operation is performed according to the IEC/IEEE Standard for
|
2659 |
Binary Floating-point Arithmetic.
|
2660 |
-------------------------------------------------------------------------------
|
2661 |
*/
|
2662 |
floatx80 floatx80_round_to_int( floatx80 a ) |
2663 |
{ |
2664 |
flag aSign; |
2665 |
int32 aExp; |
2666 |
bits64 lastBitMask, roundBitsMask; |
2667 |
int8 roundingMode; |
2668 |
floatx80 z; |
2669 |
|
2670 |
aExp = extractFloatx80Exp( a ); |
2671 |
if ( 0x403E <= aExp ) { |
2672 |
if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) { |
2673 |
return propagateFloatx80NaN( a, a );
|
2674 |
} |
2675 |
return a;
|
2676 |
} |
2677 |
if ( aExp <= 0x3FFE ) { |
2678 |
if ( ( aExp == 0 ) |
2679 |
&& ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { |
2680 |
return a;
|
2681 |
} |
2682 |
float_exception_flags |= float_flag_inexact; |
2683 |
aSign = extractFloatx80Sign( a ); |
2684 |
switch ( float_rounding_mode ) {
|
2685 |
case float_round_nearest_even:
|
2686 |
if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 ) |
2687 |
) { |
2688 |
return
|
2689 |
packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); |
2690 |
} |
2691 |
break;
|
2692 |
case float_round_down:
|
2693 |
return
|
2694 |
aSign ? |
2695 |
packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) |
2696 |
: packFloatx80( 0, 0, 0 ); |
2697 |
case float_round_up:
|
2698 |
return
|
2699 |
aSign ? packFloatx80( 1, 0, 0 ) |
2700 |
: packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); |
2701 |
} |
2702 |
return packFloatx80( aSign, 0, 0 ); |
2703 |
} |
2704 |
lastBitMask = 1;
|
2705 |
lastBitMask <<= 0x403E - aExp;
|
2706 |
roundBitsMask = lastBitMask - 1;
|
2707 |
z = a; |
2708 |
roundingMode = float_rounding_mode; |
2709 |
if ( roundingMode == float_round_nearest_even ) {
|
2710 |
z.low += lastBitMask>>1;
|
2711 |
if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; |
2712 |
} |
2713 |
else if ( roundingMode != float_round_to_zero ) { |
2714 |
if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
|
2715 |
z.low += roundBitsMask; |
2716 |
} |
2717 |
} |
2718 |
z.low &= ~ roundBitsMask; |
2719 |
if ( z.low == 0 ) { |
2720 |
++z.high; |
2721 |
z.low = LIT64( 0x8000000000000000 );
|
2722 |
} |
2723 |
if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
|
2724 |
return z;
|
2725 |
|
2726 |
} |
2727 |
|
2728 |
/*
|
2729 |
-------------------------------------------------------------------------------
|
2730 |
Returns the result of adding the absolute values of the extended double-
|
2731 |
precision floating-point values `a' and `b'. If `zSign' is true, the sum is
|
2732 |
negated before being returned. `zSign' is ignored if the result is a NaN.
|
2733 |
The addition is performed according to the IEC/IEEE Standard for Binary
|
2734 |
Floating-point Arithmetic.
|
2735 |
-------------------------------------------------------------------------------
|
2736 |
*/
|
2737 |
static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
|
2738 |
{ |
2739 |
int32 aExp, bExp, zExp; |
2740 |
bits64 aSig, bSig, zSig0, zSig1; |
2741 |
int32 expDiff; |
2742 |
|
2743 |
aSig = extractFloatx80Frac( a ); |
2744 |
aExp = extractFloatx80Exp( a ); |
2745 |
bSig = extractFloatx80Frac( b ); |
2746 |
bExp = extractFloatx80Exp( b ); |
2747 |
expDiff = aExp - bExp; |
2748 |
if ( 0 < expDiff ) { |
2749 |
if ( aExp == 0x7FFF ) { |
2750 |
if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); |
2751 |
return a;
|
2752 |
} |
2753 |
if ( bExp == 0 ) --expDiff; |
2754 |
shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
|
2755 |
zExp = aExp; |
2756 |
} |
2757 |
else if ( expDiff < 0 ) { |
2758 |
if ( bExp == 0x7FFF ) { |
2759 |
if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); |
2760 |
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); |
2761 |
} |
2762 |
if ( aExp == 0 ) ++expDiff; |
2763 |
shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
|
2764 |
zExp = bExp; |
2765 |
} |
2766 |
else {
|
2767 |
if ( aExp == 0x7FFF ) { |
2768 |
if ( (bits64) ( ( aSig | bSig )<<1 ) ) { |
2769 |
return propagateFloatx80NaN( a, b );
|
2770 |
} |
2771 |
return a;
|
2772 |
} |
2773 |
zSig1 = 0;
|
2774 |
zSig0 = aSig + bSig; |
2775 |
if ( aExp == 0 ) { |
2776 |
normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); |
2777 |
goto roundAndPack;
|
2778 |
} |
2779 |
zExp = aExp; |
2780 |
goto shiftRight1;
|
2781 |
} |
2782 |
|
2783 |
zSig0 = aSig + bSig; |
2784 |
|
2785 |
if ( (sbits64) zSig0 < 0 ) goto roundAndPack; |
2786 |
shiftRight1:
|
2787 |
shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
|
2788 |
zSig0 |= LIT64( 0x8000000000000000 );
|
2789 |
++zExp; |
2790 |
roundAndPack:
|
2791 |
return
|
2792 |
roundAndPackFloatx80( |
2793 |
floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); |
2794 |
|
2795 |
} |
2796 |
|
2797 |
/*
|
2798 |
-------------------------------------------------------------------------------
|
2799 |
Returns the result of subtracting the absolute values of the extended
|
2800 |
double-precision floating-point values `a' and `b'. If `zSign' is true,
|
2801 |
the difference is negated before being returned. `zSign' is ignored if the
|
2802 |
result is a NaN. The subtraction is performed according to the IEC/IEEE
|
2803 |
Standard for Binary Floating-point Arithmetic.
|
2804 |
-------------------------------------------------------------------------------
|
2805 |
*/
|
2806 |
static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
|
2807 |
{ |
2808 |
int32 aExp, bExp, zExp; |
2809 |
bits64 aSig, bSig, zSig0, zSig1; |
2810 |
int32 expDiff; |
2811 |
floatx80 z; |
2812 |
|
2813 |
aSig = extractFloatx80Frac( a ); |
2814 |
aExp = extractFloatx80Exp( a ); |
2815 |
bSig = extractFloatx80Frac( b ); |
2816 |
bExp = extractFloatx80Exp( b ); |
2817 |
expDiff = aExp - bExp; |
2818 |
if ( 0 < expDiff ) goto aExpBigger; |
2819 |
if ( expDiff < 0 ) goto bExpBigger; |
2820 |
if ( aExp == 0x7FFF ) { |
2821 |
if ( (bits64) ( ( aSig | bSig )<<1 ) ) { |
2822 |
return propagateFloatx80NaN( a, b );
|
2823 |
} |
2824 |
float_raise( float_flag_invalid ); |
2825 |
z.low = floatx80_default_nan_low; |
2826 |
z.high = floatx80_default_nan_high; |
2827 |
return z;
|
2828 |
} |
2829 |
if ( aExp == 0 ) { |
2830 |
aExp = 1;
|
2831 |
bExp = 1;
|
2832 |
} |
2833 |
zSig1 = 0;
|
2834 |
if ( bSig < aSig ) goto aBigger; |
2835 |
if ( aSig < bSig ) goto bBigger; |
2836 |
return packFloatx80( float_rounding_mode == float_round_down, 0, 0 ); |
2837 |
bExpBigger:
|
2838 |
if ( bExp == 0x7FFF ) { |
2839 |
if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); |
2840 |
return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); |
2841 |
} |
2842 |
if ( aExp == 0 ) ++expDiff; |
2843 |
shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
|
2844 |
bBigger:
|
2845 |
sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
|
2846 |
zExp = bExp; |
2847 |
zSign ^= 1;
|
2848 |
goto normalizeRoundAndPack;
|
2849 |
aExpBigger:
|
2850 |
if ( aExp == 0x7FFF ) { |
2851 |
if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); |
2852 |
return a;
|
2853 |
} |
2854 |
if ( bExp == 0 ) --expDiff; |
2855 |
shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
|
2856 |
aBigger:
|
2857 |
sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
|
2858 |
zExp = aExp; |
2859 |
normalizeRoundAndPack:
|
2860 |
return
|
2861 |
normalizeRoundAndPackFloatx80( |
2862 |
floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); |
2863 |
|
2864 |
} |
2865 |
|
2866 |
/*
|
2867 |
-------------------------------------------------------------------------------
|
2868 |
Returns the result of adding the extended double-precision floating-point
|
2869 |
values `a' and `b'. The operation is performed according to the IEC/IEEE
|
2870 |
Standard for Binary Floating-point Arithmetic.
|
2871 |
-------------------------------------------------------------------------------
|
2872 |
*/
|
2873 |
floatx80 floatx80_add( floatx80 a, floatx80 b ) |
2874 |
{ |
2875 |
flag aSign, bSign; |
2876 |
|
2877 |
aSign = extractFloatx80Sign( a ); |
2878 |
bSign = extractFloatx80Sign( b ); |
2879 |
if ( aSign == bSign ) {
|
2880 |
return addFloatx80Sigs( a, b, aSign );
|
2881 |
} |
2882 |
else {
|
2883 |
return subFloatx80Sigs( a, b, aSign );
|
2884 |
} |
2885 |
|
2886 |
} |
2887 |
|
2888 |
/*
|
2889 |
-------------------------------------------------------------------------------
|
2890 |
Returns the result of subtracting the extended double-precision floating-
|
2891 |
point values `a' and `b'. The operation is performed according to the
|
2892 |
IEC/IEEE Standard for Binary Floating-point Arithmetic.
|
2893 |
-------------------------------------------------------------------------------
|
2894 |
*/
|
2895 |
floatx80 floatx80_sub( floatx80 a, floatx80 b ) |
2896 |
{ |
2897 |
flag aSign, bSign; |
2898 |
|
2899 |
aSign = extractFloatx80Sign( a ); |
2900 |
bSign = extractFloatx80Sign( b ); |
2901 |
if ( aSign == bSign ) {
|
2902 |
return subFloatx80Sigs( a, b, aSign );
|
2903 |
} |
2904 |
else {
|
2905 |
return addFloatx80Sigs( a, b, aSign );
|
2906 |
} |
2907 |
|
2908 |
} |
2909 |
|
2910 |
/*
|
2911 |
-------------------------------------------------------------------------------
|
2912 |
Returns the result of multiplying the extended double-precision floating-
|
2913 |
point values `a' and `b'. The operation is performed according to the
|
2914 |
IEC/IEEE Standard for Binary Floating-point Arithmetic.
|
2915 |
-------------------------------------------------------------------------------
|
2916 |
*/
|
2917 |
floatx80 floatx80_mul( floatx80 a, floatx80 b ) |
2918 |
{ |
2919 |
flag aSign, bSign, zSign; |
2920 |
int32 aExp, bExp, zExp; |
2921 |
bits64 aSig, bSig, zSig0, zSig1; |
2922 |
floatx80 z; |
2923 |
|
2924 |
aSig = extractFloatx80Frac( a ); |
2925 |
aExp = extractFloatx80Exp( a ); |
2926 |
aSign = extractFloatx80Sign( a ); |
2927 |
bSig = extractFloatx80Frac( b ); |
2928 |
bExp = extractFloatx80Exp( b ); |
2929 |
bSign = extractFloatx80Sign( b ); |
2930 |
zSign = aSign ^ bSign; |
2931 |
if ( aExp == 0x7FFF ) { |
2932 |
if ( (bits64) ( aSig<<1 ) |
2933 |
|| ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { |
2934 |
return propagateFloatx80NaN( a, b );
|
2935 |
} |
2936 |
if ( ( bExp | bSig ) == 0 ) goto invalid; |
2937 |
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); |
2938 |
} |
2939 |
if ( bExp == 0x7FFF ) { |
2940 |
if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); |
2941 |
if ( ( aExp | aSig ) == 0 ) { |
2942 |
invalid:
|
2943 |
float_raise( float_flag_invalid ); |
2944 |
z.low = floatx80_default_nan_low; |
2945 |
z.high = floatx80_default_nan_high; |
2946 |
return z;
|
2947 |
} |
2948 |
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); |
2949 |
} |
2950 |
if ( aExp == 0 ) { |
2951 |
if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); |
2952 |
normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); |
2953 |
} |
2954 |
if ( bExp == 0 ) { |
2955 |
if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); |
2956 |
normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); |
2957 |
} |
2958 |
zExp = aExp + bExp - 0x3FFE;
|
2959 |
mul64To128( aSig, bSig, &zSig0, &zSig1 ); |
2960 |
if ( 0 < (sbits64) zSig0 ) { |
2961 |
shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
|
2962 |
--zExp; |
2963 |
} |
2964 |
return
|
2965 |
roundAndPackFloatx80( |
2966 |
floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); |
2967 |
|
2968 |
} |
2969 |
|
2970 |
/*
|
2971 |
-------------------------------------------------------------------------------
|
2972 |
Returns the result of dividing the extended double-precision floating-point
|
2973 |
value `a' by the corresponding value `b'. The operation is performed
|
2974 |
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
|
2975 |
-------------------------------------------------------------------------------
|
2976 |
*/
|
2977 |
floatx80 floatx80_div( floatx80 a, floatx80 b ) |
2978 |
{ |
2979 |
flag aSign, bSign, zSign; |
2980 |
int32 aExp, bExp, zExp; |
2981 |
bits64 aSig, bSig, zSig0, zSig1; |
2982 |
bits64 rem0, rem1, rem2, term0, term1, term2; |
2983 |
floatx80 z; |
2984 |
|
2985 |
aSig = extractFloatx80Frac( a ); |
2986 |
aExp = extractFloatx80Exp( a ); |
2987 |
aSign = extractFloatx80Sign( a ); |
2988 |
bSig = extractFloatx80Frac( b ); |
2989 |
bExp = extractFloatx80Exp( b ); |
2990 |
bSign = extractFloatx80Sign( b ); |
2991 |
zSign = aSign ^ bSign; |
2992 |
if ( aExp == 0x7FFF ) { |
2993 |
if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); |
2994 |
if ( bExp == 0x7FFF ) { |
2995 |
if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); |
2996 |
goto invalid;
|
2997 |
} |
2998 |
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); |
2999 |
} |
3000 |
if ( bExp == 0x7FFF ) { |
3001 |
if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); |
3002 |
return packFloatx80( zSign, 0, 0 ); |
3003 |
} |
3004 |
if ( bExp == 0 ) { |
3005 |
if ( bSig == 0 ) { |
3006 |
if ( ( aExp | aSig ) == 0 ) { |
3007 |
invalid:
|
3008 |
float_raise( float_flag_invalid ); |
3009 |
z.low = floatx80_default_nan_low; |
3010 |
z.high = floatx80_default_nan_high; |
3011 |
return z;
|
3012 |
} |
3013 |
float_raise( float_flag_divbyzero ); |
3014 |
return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); |
3015 |
} |
3016 |
normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); |
3017 |
} |
3018 |
if ( aExp == 0 ) { |
3019 |
if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); |
3020 |
normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); |
3021 |
} |
3022 |
zExp = aExp - bExp + 0x3FFE;
|
3023 |
rem1 = 0;
|
3024 |
if ( bSig <= aSig ) {
|
3025 |
shift128Right( aSig, 0, 1, &aSig, &rem1 ); |
3026 |
++zExp; |
3027 |
} |
3028 |
zSig0 = estimateDiv128To64( aSig, rem1, bSig ); |
3029 |
mul64To128( bSig, zSig0, &term0, &term1 ); |
3030 |
sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); |
3031 |
while ( (sbits64) rem0 < 0 ) { |
3032 |
--zSig0; |
3033 |
add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
|
3034 |
} |
3035 |
zSig1 = estimateDiv128To64( rem1, 0, bSig );
|
3036 |
if ( (bits64) ( zSig1<<1 ) <= 8 ) { |
3037 |
mul64To128( bSig, zSig1, &term1, &term2 ); |
3038 |
sub128( rem1, 0, term1, term2, &rem1, &rem2 );
|
3039 |
while ( (sbits64) rem1 < 0 ) { |
3040 |
--zSig1; |
3041 |
add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
|
3042 |
} |
3043 |
zSig1 |= ( ( rem1 | rem2 ) != 0 );
|
3044 |
} |
3045 |
return
|
3046 |
roundAndPackFloatx80( |
3047 |
floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); |
3048 |
|
3049 |
} |
3050 |
|
3051 |
/*
|
3052 |
-------------------------------------------------------------------------------
|
3053 |
Returns the remainder of the extended double-precision floating-point value
|
3054 |
`a' with respect to the corresponding value `b'. The operation is performed
|
3055 |
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
|
3056 |
-------------------------------------------------------------------------------
|
3057 |
*/
|
3058 |
floatx80 floatx80_rem( floatx80 a, floatx80 b ) |
3059 |
{ |
3060 |
flag aSign, bSign, zSign; |
3061 |
int32 aExp, bExp, expDiff; |
3062 |
bits64 aSig0, aSig1, bSig; |
3063 |
bits64 q, term0, term1, alternateASig0, alternateASig1; |
3064 |
floatx80 z; |
3065 |
|
3066 |
aSig0 = extractFloatx80Frac( a ); |
3067 |
aExp = extractFloatx80Exp( a ); |
3068 |
aSign = extractFloatx80Sign( a ); |
3069 |
bSig = extractFloatx80Frac( b ); |
3070 |
bExp = extractFloatx80Exp( b ); |
3071 |
bSign = extractFloatx80Sign( b ); |
3072 |
if ( aExp == 0x7FFF ) { |
3073 |
if ( (bits64) ( aSig0<<1 ) |
3074 |
|| ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { |
3075 |
return propagateFloatx80NaN( a, b );
|
3076 |
} |
3077 |
goto invalid;
|
3078 |
} |
3079 |
if ( bExp == 0x7FFF ) { |
3080 |
if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); |
3081 |
return a;
|
3082 |
} |
3083 |
if ( bExp == 0 ) { |
3084 |
if ( bSig == 0 ) { |
3085 |
invalid:
|
3086 |
float_raise( float_flag_invalid ); |
3087 |
z.low = floatx80_default_nan_low; |
3088 |
z.high = floatx80_default_nan_high; |
3089 |
return z;
|
3090 |
} |
3091 |
normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); |
3092 |
} |
3093 |
if ( aExp == 0 ) { |
3094 |
if ( (bits64) ( aSig0<<1 ) == 0 ) return a; |
3095 |
normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); |
3096 |
} |
3097 |
bSig |= LIT64( 0x8000000000000000 );
|
3098 |
zSign = aSign; |
3099 |
expDiff = aExp - bExp; |
3100 |
aSig1 = 0;
|
3101 |
if ( expDiff < 0 ) { |
3102 |
if ( expDiff < -1 ) return a; |
3103 |
shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); |
3104 |
expDiff = 0;
|
3105 |
} |
3106 |
q = ( bSig <= aSig0 ); |
3107 |
if ( q ) aSig0 -= bSig;
|
3108 |
expDiff -= 64;
|
3109 |
while ( 0 < expDiff ) { |
3110 |
q = estimateDiv128To64( aSig0, aSig1, bSig ); |
3111 |
q = ( 2 < q ) ? q - 2 : 0; |
3112 |
mul64To128( bSig, q, &term0, &term1 ); |
3113 |
sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); |
3114 |
shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
|
3115 |
expDiff -= 62;
|
3116 |
} |
3117 |
expDiff += 64;
|
3118 |
if ( 0 < expDiff ) { |
3119 |
q = estimateDiv128To64( aSig0, aSig1, bSig ); |
3120 |
q = ( 2 < q ) ? q - 2 : 0; |
3121 |
q >>= 64 - expDiff;
|
3122 |
mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
|
3123 |
sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); |
3124 |
shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); |
3125 |
while ( le128( term0, term1, aSig0, aSig1 ) ) {
|
3126 |
++q; |
3127 |
sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); |
3128 |
} |
3129 |
} |
3130 |
else {
|
3131 |
term1 = 0;
|
3132 |
term0 = bSig; |
3133 |
} |
3134 |
sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); |
3135 |
if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
|
3136 |
|| ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) |
3137 |
&& ( q & 1 ) )
|
3138 |
) { |
3139 |
aSig0 = alternateASig0; |
3140 |
aSig1 = alternateASig1; |
3141 |
zSign = ! zSign; |
3142 |
} |
3143 |
return
|
3144 |
normalizeRoundAndPackFloatx80( |
3145 |
80, zSign, bExp + expDiff, aSig0, aSig1 );
|
3146 |
|
3147 |
} |
3148 |
|
3149 |
/*
|
3150 |
-------------------------------------------------------------------------------
|
3151 |
Returns the square root of the extended double-precision floating-point
|
3152 |
value `a'. The operation is performed according to the IEC/IEEE Standard
|
3153 |
for Binary Floating-point Arithmetic.
|
3154 |
-------------------------------------------------------------------------------
|
3155 |
*/
|
3156 |
floatx80 floatx80_sqrt( floatx80 a ) |
3157 |
{ |
3158 |
flag aSign; |
3159 |
int32 aExp, zExp; |
3160 |
bits64 aSig0, aSig1, zSig0, zSig1; |
3161 |
bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; |
3162 |
bits64 shiftedRem0, shiftedRem1; |
3163 |
floatx80 z; |
3164 |
|
3165 |
aSig0 = extractFloatx80Frac( a ); |
3166 |
aExp = extractFloatx80Exp( a ); |
3167 |
aSign = extractFloatx80Sign( a ); |
3168 |
if ( aExp == 0x7FFF ) { |
3169 |
if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a ); |
3170 |
if ( ! aSign ) return a; |
3171 |
goto invalid;
|
3172 |
} |
3173 |
if ( aSign ) {
|
3174 |
if ( ( aExp | aSig0 ) == 0 ) return a; |
3175 |
invalid:
|
3176 |
float_raise( float_flag_invalid ); |
3177 |
z.low = floatx80_default_nan_low; |
3178 |
z.high = floatx80_default_nan_high; |
3179 |
return z;
|
3180 |
} |
3181 |
if ( aExp == 0 ) { |
3182 |
if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); |
3183 |
normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); |
3184 |
} |
3185 |
zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; |
3186 |
zSig0 = estimateSqrt32( aExp, aSig0>>32 );
|
3187 |
zSig0 <<= 31;
|
3188 |
aSig1 = 0;
|
3189 |
shift128Right( aSig0, 0, ( aExp & 1 ) + 2, &aSig0, &aSig1 ); |
3190 |
zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4;
|
3191 |
if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF ); |
3192 |
shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 );
|
3193 |
mul64To128( zSig0, zSig0, &term0, &term1 ); |
3194 |
sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); |
3195 |
while ( (sbits64) rem0 < 0 ) { |
3196 |
--zSig0; |
3197 |
shortShift128Left( 0, zSig0, 1, &term0, &term1 ); |
3198 |
term1 |= 1;
|
3199 |
add128( rem0, rem1, term0, term1, &rem0, &rem1 ); |
3200 |
} |
3201 |
shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 );
|
3202 |
zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 ); |
3203 |
if ( (bits64) ( zSig1<<1 ) <= 10 ) { |
3204 |
if ( zSig1 == 0 ) zSig1 = 1; |
3205 |
mul64To128( zSig0, zSig1, &term1, &term2 ); |
3206 |
shortShift128Left( term1, term2, 1, &term1, &term2 );
|
3207 |
sub128( rem1, 0, term1, term2, &rem1, &rem2 );
|
3208 |
mul64To128( zSig1, zSig1, &term2, &term3 ); |
3209 |
sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); |
3210 |
while ( (sbits64) rem1 < 0 ) { |
3211 |
--zSig1; |
3212 |
shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 ); |
3213 |
term3 |= 1;
|
3214 |
add192( |
3215 |
rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 ); |
3216 |
} |
3217 |
zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
|
3218 |
} |
3219 |
return
|
3220 |
roundAndPackFloatx80( |
3221 |
floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
|
3222 |
|
3223 |
} |
3224 |
|
3225 |
/*
|
3226 |
-------------------------------------------------------------------------------
|
3227 |
Returns 1 if the extended double-precision floating-point value `a' is
|
3228 |
equal to the corresponding value `b', and 0 otherwise. The comparison is
|
3229 |
performed according to the IEC/IEEE Standard for Binary Floating-point
|
3230 |
Arithmetic.
|
3231 |
-------------------------------------------------------------------------------
|
3232 |
*/
|
3233 |
flag floatx80_eq( floatx80 a, floatx80 b ) |
3234 |
{ |
3235 |
|
3236 |
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) |
3237 |
&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
|
3238 |
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
|
3239 |
&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
|
3240 |
) { |
3241 |
if ( floatx80_is_signaling_nan( a )
|
3242 |
|| floatx80_is_signaling_nan( b ) ) { |
3243 |
float_raise( float_flag_invalid ); |
3244 |
} |
3245 |
return 0; |
3246 |
} |
3247 |
return
|
3248 |
( a.low == b.low ) |
3249 |
&& ( ( a.high == b.high ) |
3250 |
|| ( ( a.low == 0 )
|
3251 |
&& ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) |
3252 |
); |
3253 |
|
3254 |
} |
3255 |
|
3256 |
/*
|
3257 |
-------------------------------------------------------------------------------
|
3258 |
Returns 1 if the extended double-precision floating-point value `a' is
|
3259 |
less than or equal to the corresponding value `b', and 0 otherwise. The
|
3260 |
comparison is performed according to the IEC/IEEE Standard for Binary
|
3261 |
Floating-point Arithmetic.
|
3262 |
-------------------------------------------------------------------------------
|
3263 |
*/
|
3264 |
flag floatx80_le( floatx80 a, floatx80 b ) |
3265 |
{ |
3266 |
flag aSign, bSign; |
3267 |
|
3268 |
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) |
3269 |
&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
|
3270 |
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
|
3271 |
&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
|
3272 |
) { |
3273 |
float_raise( float_flag_invalid ); |
3274 |
return 0; |
3275 |
} |
3276 |
aSign = extractFloatx80Sign( a ); |
3277 |
bSign = extractFloatx80Sign( b ); |
3278 |
if ( aSign != bSign ) {
|
3279 |
return
|
3280 |
aSign |
3281 |
|| ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
|
3282 |
== 0 );
|
3283 |
} |
3284 |
return
|
3285 |
aSign ? le128( b.high, b.low, a.high, a.low ) |
3286 |
: le128( a.high, a.low, b.high, b.low ); |
3287 |
|
3288 |
} |
3289 |
|
3290 |
/*
|
3291 |
-------------------------------------------------------------------------------
|
3292 |
Returns 1 if the extended double-precision floating-point value `a' is
|
3293 |
less than the corresponding value `b', and 0 otherwise. The comparison
|
3294 |
is performed according to the IEC/IEEE Standard for Binary Floating-point
|
3295 |
Arithmetic.
|
3296 |
-------------------------------------------------------------------------------
|
3297 |
*/
|
3298 |
flag floatx80_lt( floatx80 a, floatx80 b ) |
3299 |
{ |
3300 |
flag aSign, bSign; |
3301 |
|
3302 |
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) |
3303 |
&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
|
3304 |
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
|
3305 |
&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
|
3306 |
) { |
3307 |
float_raise( float_flag_invalid ); |
3308 |
return 0; |
3309 |
} |
3310 |
aSign = extractFloatx80Sign( a ); |
3311 |
bSign = extractFloatx80Sign( b ); |
3312 |
if ( aSign != bSign ) {
|
3313 |
return
|
3314 |
aSign |
3315 |
&& ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
|
3316 |
!= 0 );
|
3317 |
} |
3318 |
return
|
3319 |
aSign ? lt128( b.high, b.low, a.high, a.low ) |
3320 |
: lt128( a.high, a.low, b.high, b.low ); |
3321 |
|
3322 |
} |
3323 |
|
3324 |
/*
|
3325 |
-------------------------------------------------------------------------------
|
3326 |
Returns 1 if the extended double-precision floating-point value `a' is equal
|
3327 |
to the corresponding value `b', and 0 otherwise. The invalid exception is
|
3328 |
raised if either operand is a NaN. Otherwise, the comparison is performed
|
3329 |
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
|
3330 |
-------------------------------------------------------------------------------
|
3331 |
*/
|
3332 |
flag floatx80_eq_signaling( floatx80 a, floatx80 b ) |
3333 |
{ |
3334 |
|
3335 |
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) |
3336 |
&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
|
3337 |
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
|
3338 |
&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
|
3339 |
) { |
3340 |
float_raise( float_flag_invalid ); |
3341 |
return 0; |
3342 |
} |
3343 |
return
|
3344 |
( a.low == b.low ) |
3345 |
&& ( ( a.high == b.high ) |
3346 |
|| ( ( a.low == 0 )
|
3347 |
&& ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) |
3348 |
); |
3349 |
|
3350 |
} |
3351 |
|
3352 |
/*
|
3353 |
-------------------------------------------------------------------------------
|
3354 |
Returns 1 if the extended double-precision floating-point value `a' is less
|
3355 |
than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
|
3356 |
do not cause an exception. Otherwise, the comparison is performed according
|
3357 |
to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
|
3358 |
-------------------------------------------------------------------------------
|
3359 |
*/
|
3360 |
flag floatx80_le_quiet( floatx80 a, floatx80 b ) |
3361 |
{ |
3362 |
flag aSign, bSign; |
3363 |
|
3364 |
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) |
3365 |
&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
|
3366 |
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
|
3367 |
&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
|
3368 |
) { |
3369 |
if ( floatx80_is_signaling_nan( a )
|
3370 |
|| floatx80_is_signaling_nan( b ) ) { |
3371 |
float_raise( float_flag_invalid ); |
3372 |
} |
3373 |
return 0; |
3374 |
} |
3375 |
aSign = extractFloatx80Sign( a ); |
3376 |
bSign = extractFloatx80Sign( b ); |
3377 |
if ( aSign != bSign ) {
|
3378 |
return
|
3379 |
aSign |
3380 |
|| ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
|
3381 |
== 0 );
|
3382 |
} |
3383 |
return
|
3384 |
aSign ? le128( b.high, b.low, a.high, a.low ) |
3385 |
: le128( a.high, a.low, b.high, b.low ); |
3386 |
|
3387 |
} |
3388 |
|
3389 |
/*
|
3390 |
-------------------------------------------------------------------------------
|
3391 |
Returns 1 if the extended double-precision floating-point value `a' is less
|
3392 |
than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
|
3393 |
an exception. Otherwise, the comparison is performed according to the
|
3394 |
IEC/IEEE Standard for Binary Floating-point Arithmetic.
|
3395 |
-------------------------------------------------------------------------------
|
3396 |
*/
|
3397 |
flag floatx80_lt_quiet( floatx80 a, floatx80 b ) |
3398 |
{ |
3399 |
flag aSign, bSign; |
3400 |
|
3401 |
if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) |
3402 |
&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
|
3403 |
|| ( ( extractFloatx80Exp( b ) == 0x7FFF )
|
3404 |
&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
|
3405 |
) { |
3406 |
if ( floatx80_is_signaling_nan( a )
|
3407 |
|| floatx80_is_signaling_nan( b ) ) { |
3408 |
float_raise( float_flag_invalid ); |
3409 |
} |
3410 |
return 0; |
3411 |
} |
3412 |
aSign = extractFloatx80Sign( a ); |
3413 |
bSign = extractFloatx80Sign( b ); |
3414 |
if ( aSign != bSign ) {
|
3415 |
return
|
3416 |
aSign |
3417 |
&& ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
|
3418 |
!= 0 );
|
3419 |
} |
3420 |
return
|
3421 |
aSign ? lt128( b.high, b.low, a.high, a.low ) |
3422 |
: lt128( a.high, a.low, b.high, b.low ); |
3423 |
|
3424 |
} |
3425 |
|
3426 |
#endif
|
3427 |
|