Statistics
| Branch: | Revision:

root / fpu / softfloat.c @ 8229c991

History | View | Annotate | Download (204.6 kB)

1

    
2
/*============================================================================
3

4
This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5
Package, Release 2b.
6

7
Written by John R. Hauser.  This work was made possible in part by the
8
International Computer Science Institute, located at Suite 600, 1947 Center
9
Street, Berkeley, California 94704.  Funding was partially provided by the
10
National Science Foundation under grant MIP-9311980.  The original version
11
of this code was written as part of a project to build a fixed-point vector
12
processor in collaboration with the University of California at Berkeley,
13
overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
14
is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15
arithmetic/SoftFloat.html'.
16

17
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
18
been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19
RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20
AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21
COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22
EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23
INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24
OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
25

26
Derivative works are acceptable, even for commercial purposes, so long as
27
(1) the source code for the derivative work includes prominent notice that
28
the work is derivative, and (2) the source code includes prominent notice with
29
these four paragraphs for those parts of this code that are retained.
30

31
=============================================================================*/
32

    
33
/* FIXME: Flush-To-Zero only effects results.  Denormal inputs should also
34
   be flushed to zero.  */
35
#include "softfloat.h"
36

    
37
/*----------------------------------------------------------------------------
38
| Primitive arithmetic functions, including multi-word arithmetic, and
39
| division and square root approximations.  (Can be specialized to target if
40
| desired.)
41
*----------------------------------------------------------------------------*/
42
#include "softfloat-macros.h"
43

    
44
/*----------------------------------------------------------------------------
45
| Functions and definitions to determine:  (1) whether tininess for underflow
46
| is detected before or after rounding by default, (2) what (if anything)
47
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
48
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
49
| are propagated from function inputs to output.  These details are target-
50
| specific.
51
*----------------------------------------------------------------------------*/
52
#include "softfloat-specialize.h"
53

    
54
void set_float_rounding_mode(int val STATUS_PARAM)
55
{
56
    STATUS(float_rounding_mode) = val;
57
}
58

    
59
void set_float_exception_flags(int val STATUS_PARAM)
60
{
61
    STATUS(float_exception_flags) = val;
62
}
63

    
64
#ifdef FLOATX80
65
void set_floatx80_rounding_precision(int val STATUS_PARAM)
66
{
67
    STATUS(floatx80_rounding_precision) = val;
68
}
69
#endif
70

    
71
/*----------------------------------------------------------------------------
72
| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
73
| and 7, and returns the properly rounded 32-bit integer corresponding to the
74
| input.  If `zSign' is 1, the input is negated before being converted to an
75
| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
76
| is simply rounded to an integer, with the inexact exception raised if the
77
| input cannot be represented exactly as an integer.  However, if the fixed-
78
| point input is too large, the invalid exception is raised and the largest
79
| positive or negative integer is returned.
80
*----------------------------------------------------------------------------*/
81

    
82
static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
83
{
84
    int8 roundingMode;
85
    flag roundNearestEven;
86
    int8 roundIncrement, roundBits;
87
    int32 z;
88

    
89
    roundingMode = STATUS(float_rounding_mode);
90
    roundNearestEven = ( roundingMode == float_round_nearest_even );
91
    roundIncrement = 0x40;
92
    if ( ! roundNearestEven ) {
93
        if ( roundingMode == float_round_to_zero ) {
94
            roundIncrement = 0;
95
        }
96
        else {
97
            roundIncrement = 0x7F;
98
            if ( zSign ) {
99
                if ( roundingMode == float_round_up ) roundIncrement = 0;
100
            }
101
            else {
102
                if ( roundingMode == float_round_down ) roundIncrement = 0;
103
            }
104
        }
105
    }
106
    roundBits = absZ & 0x7F;
107
    absZ = ( absZ + roundIncrement )>>7;
108
    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
109
    z = absZ;
110
    if ( zSign ) z = - z;
111
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
112
        float_raise( float_flag_invalid STATUS_VAR);
113
        return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
114
    }
115
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
116
    return z;
117

    
118
}
119

    
120
/*----------------------------------------------------------------------------
121
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
122
| `absZ1', with binary point between bits 63 and 64 (between the input words),
123
| and returns the properly rounded 64-bit integer corresponding to the input.
124
| If `zSign' is 1, the input is negated before being converted to an integer.
125
| Ordinarily, the fixed-point input is simply rounded to an integer, with
126
| the inexact exception raised if the input cannot be represented exactly as
127
| an integer.  However, if the fixed-point input is too large, the invalid
128
| exception is raised and the largest positive or negative integer is
129
| returned.
130
*----------------------------------------------------------------------------*/
131

    
132
static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
133
{
134
    int8 roundingMode;
135
    flag roundNearestEven, increment;
136
    int64 z;
137

    
138
    roundingMode = STATUS(float_rounding_mode);
139
    roundNearestEven = ( roundingMode == float_round_nearest_even );
140
    increment = ( (sbits64) absZ1 < 0 );
141
    if ( ! roundNearestEven ) {
142
        if ( roundingMode == float_round_to_zero ) {
143
            increment = 0;
144
        }
145
        else {
146
            if ( zSign ) {
147
                increment = ( roundingMode == float_round_down ) && absZ1;
148
            }
149
            else {
150
                increment = ( roundingMode == float_round_up ) && absZ1;
151
            }
152
        }
153
    }
154
    if ( increment ) {
155
        ++absZ0;
156
        if ( absZ0 == 0 ) goto overflow;
157
        absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
158
    }
159
    z = absZ0;
160
    if ( zSign ) z = - z;
161
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
162
 overflow:
163
        float_raise( float_flag_invalid STATUS_VAR);
164
        return
165
              zSign ? (sbits64) LIT64( 0x8000000000000000 )
166
            : LIT64( 0x7FFFFFFFFFFFFFFF );
167
    }
168
    if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
169
    return z;
170

    
171
}
172

    
173
/*----------------------------------------------------------------------------
174
| Returns the fraction bits of the single-precision floating-point value `a'.
175
*----------------------------------------------------------------------------*/
176

    
177
INLINE bits32 extractFloat32Frac( float32 a )
178
{
179

    
180
    return float32_val(a) & 0x007FFFFF;
181

    
182
}
183

    
184
/*----------------------------------------------------------------------------
185
| Returns the exponent bits of the single-precision floating-point value `a'.
186
*----------------------------------------------------------------------------*/
187

    
188
INLINE int16 extractFloat32Exp( float32 a )
189
{
190

    
191
    return ( float32_val(a)>>23 ) & 0xFF;
192

    
193
}
194

    
195
/*----------------------------------------------------------------------------
196
| Returns the sign bit of the single-precision floating-point value `a'.
197
*----------------------------------------------------------------------------*/
198

    
199
INLINE flag extractFloat32Sign( float32 a )
200
{
201

    
202
    return float32_val(a)>>31;
203

    
204
}
205

    
206
/*----------------------------------------------------------------------------
207
| Normalizes the subnormal single-precision floating-point value represented
208
| by the denormalized significand `aSig'.  The normalized exponent and
209
| significand are stored at the locations pointed to by `zExpPtr' and
210
| `zSigPtr', respectively.
211
*----------------------------------------------------------------------------*/
212

    
213
static void
214
 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
215
{
216
    int8 shiftCount;
217

    
218
    shiftCount = countLeadingZeros32( aSig ) - 8;
219
    *zSigPtr = aSig<<shiftCount;
220
    *zExpPtr = 1 - shiftCount;
221

    
222
}
223

    
224
/*----------------------------------------------------------------------------
225
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
226
| single-precision floating-point value, returning the result.  After being
227
| shifted into the proper positions, the three fields are simply added
228
| together to form the result.  This means that any integer portion of `zSig'
229
| will be added into the exponent.  Since a properly normalized significand
230
| will have an integer portion equal to 1, the `zExp' input should be 1 less
231
| than the desired result exponent whenever `zSig' is a complete, normalized
232
| significand.
233
*----------------------------------------------------------------------------*/
234

    
235
INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
236
{
237

    
238
    return make_float32(
239
          ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
240

    
241
}
242

    
243
/*----------------------------------------------------------------------------
244
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
245
| and significand `zSig', and returns the proper single-precision floating-
246
| point value corresponding to the abstract input.  Ordinarily, the abstract
247
| value is simply rounded and packed into the single-precision format, with
248
| the inexact exception raised if the abstract input cannot be represented
249
| exactly.  However, if the abstract value is too large, the overflow and
250
| inexact exceptions are raised and an infinity or maximal finite value is
251
| returned.  If the abstract value is too small, the input value is rounded to
252
| a subnormal number, and the underflow and inexact exceptions are raised if
253
| the abstract input cannot be represented exactly as a subnormal single-
254
| precision floating-point number.
255
|     The input significand `zSig' has its binary point between bits 30
256
| and 29, which is 7 bits to the left of the usual location.  This shifted
257
| significand must be normalized or smaller.  If `zSig' is not normalized,
258
| `zExp' must be 0; in that case, the result returned is a subnormal number,
259
| and it must not require rounding.  In the usual case that `zSig' is
260
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
261
| The handling of underflow and overflow follows the IEC/IEEE Standard for
262
| Binary Floating-Point Arithmetic.
263
*----------------------------------------------------------------------------*/
264

    
265
static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
266
{
267
    int8 roundingMode;
268
    flag roundNearestEven;
269
    int8 roundIncrement, roundBits;
270
    flag isTiny;
271

    
272
    roundingMode = STATUS(float_rounding_mode);
273
    roundNearestEven = ( roundingMode == float_round_nearest_even );
274
    roundIncrement = 0x40;
275
    if ( ! roundNearestEven ) {
276
        if ( roundingMode == float_round_to_zero ) {
277
            roundIncrement = 0;
278
        }
279
        else {
280
            roundIncrement = 0x7F;
281
            if ( zSign ) {
282
                if ( roundingMode == float_round_up ) roundIncrement = 0;
283
            }
284
            else {
285
                if ( roundingMode == float_round_down ) roundIncrement = 0;
286
            }
287
        }
288
    }
289
    roundBits = zSig & 0x7F;
290
    if ( 0xFD <= (bits16) zExp ) {
291
        if (    ( 0xFD < zExp )
292
             || (    ( zExp == 0xFD )
293
                  && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
294
           ) {
295
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
296
            return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
297
        }
298
        if ( zExp < 0 ) {
299
            if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
300
            isTiny =
301
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
302
                || ( zExp < -1 )
303
                || ( zSig + roundIncrement < 0x80000000 );
304
            shift32RightJamming( zSig, - zExp, &zSig );
305
            zExp = 0;
306
            roundBits = zSig & 0x7F;
307
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
308
        }
309
    }
310
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
311
    zSig = ( zSig + roundIncrement )>>7;
312
    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
313
    if ( zSig == 0 ) zExp = 0;
314
    return packFloat32( zSign, zExp, zSig );
315

    
316
}
317

    
318
/*----------------------------------------------------------------------------
319
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
320
| and significand `zSig', and returns the proper single-precision floating-
321
| point value corresponding to the abstract input.  This routine is just like
322
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
323
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
324
| floating-point exponent.
325
*----------------------------------------------------------------------------*/
326

    
327
static float32
328
 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
329
{
330
    int8 shiftCount;
331

    
332
    shiftCount = countLeadingZeros32( zSig ) - 1;
333
    return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
334

    
335
}
336

    
337
/*----------------------------------------------------------------------------
338
| Returns the fraction bits of the double-precision floating-point value `a'.
339
*----------------------------------------------------------------------------*/
340

    
341
INLINE bits64 extractFloat64Frac( float64 a )
342
{
343

    
344
    return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
345

    
346
}
347

    
348
/*----------------------------------------------------------------------------
349
| Returns the exponent bits of the double-precision floating-point value `a'.
350
*----------------------------------------------------------------------------*/
351

    
352
INLINE int16 extractFloat64Exp( float64 a )
353
{
354

    
355
    return ( float64_val(a)>>52 ) & 0x7FF;
356

    
357
}
358

    
359
/*----------------------------------------------------------------------------
360
| Returns the sign bit of the double-precision floating-point value `a'.
361
*----------------------------------------------------------------------------*/
362

    
363
INLINE flag extractFloat64Sign( float64 a )
364
{
365

    
366
    return float64_val(a)>>63;
367

    
368
}
369

    
370
/*----------------------------------------------------------------------------
371
| Normalizes the subnormal double-precision floating-point value represented
372
| by the denormalized significand `aSig'.  The normalized exponent and
373
| significand are stored at the locations pointed to by `zExpPtr' and
374
| `zSigPtr', respectively.
375
*----------------------------------------------------------------------------*/
376

    
377
static void
378
 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
379
{
380
    int8 shiftCount;
381

    
382
    shiftCount = countLeadingZeros64( aSig ) - 11;
383
    *zSigPtr = aSig<<shiftCount;
384
    *zExpPtr = 1 - shiftCount;
385

    
386
}
387

    
388
/*----------------------------------------------------------------------------
389
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
390
| double-precision floating-point value, returning the result.  After being
391
| shifted into the proper positions, the three fields are simply added
392
| together to form the result.  This means that any integer portion of `zSig'
393
| will be added into the exponent.  Since a properly normalized significand
394
| will have an integer portion equal to 1, the `zExp' input should be 1 less
395
| than the desired result exponent whenever `zSig' is a complete, normalized
396
| significand.
397
*----------------------------------------------------------------------------*/
398

    
399
INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
400
{
401

    
402
    return make_float64(
403
        ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
404

    
405
}
406

    
407
/*----------------------------------------------------------------------------
408
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
409
| and significand `zSig', and returns the proper double-precision floating-
410
| point value corresponding to the abstract input.  Ordinarily, the abstract
411
| value is simply rounded and packed into the double-precision format, with
412
| the inexact exception raised if the abstract input cannot be represented
413
| exactly.  However, if the abstract value is too large, the overflow and
414
| inexact exceptions are raised and an infinity or maximal finite value is
415
| returned.  If the abstract value is too small, the input value is rounded
416
| to a subnormal number, and the underflow and inexact exceptions are raised
417
| if the abstract input cannot be represented exactly as a subnormal double-
418
| precision floating-point number.
419
|     The input significand `zSig' has its binary point between bits 62
420
| and 61, which is 10 bits to the left of the usual location.  This shifted
421
| significand must be normalized or smaller.  If `zSig' is not normalized,
422
| `zExp' must be 0; in that case, the result returned is a subnormal number,
423
| and it must not require rounding.  In the usual case that `zSig' is
424
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
425
| The handling of underflow and overflow follows the IEC/IEEE Standard for
426
| Binary Floating-Point Arithmetic.
427
*----------------------------------------------------------------------------*/
428

    
429
static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
430
{
431
    int8 roundingMode;
432
    flag roundNearestEven;
433
    int16 roundIncrement, roundBits;
434
    flag isTiny;
435

    
436
    roundingMode = STATUS(float_rounding_mode);
437
    roundNearestEven = ( roundingMode == float_round_nearest_even );
438
    roundIncrement = 0x200;
439
    if ( ! roundNearestEven ) {
440
        if ( roundingMode == float_round_to_zero ) {
441
            roundIncrement = 0;
442
        }
443
        else {
444
            roundIncrement = 0x3FF;
445
            if ( zSign ) {
446
                if ( roundingMode == float_round_up ) roundIncrement = 0;
447
            }
448
            else {
449
                if ( roundingMode == float_round_down ) roundIncrement = 0;
450
            }
451
        }
452
    }
453
    roundBits = zSig & 0x3FF;
454
    if ( 0x7FD <= (bits16) zExp ) {
455
        if (    ( 0x7FD < zExp )
456
             || (    ( zExp == 0x7FD )
457
                  && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
458
           ) {
459
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
460
            return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
461
        }
462
        if ( zExp < 0 ) {
463
            if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
464
            isTiny =
465
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
466
                || ( zExp < -1 )
467
                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
468
            shift64RightJamming( zSig, - zExp, &zSig );
469
            zExp = 0;
470
            roundBits = zSig & 0x3FF;
471
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
472
        }
473
    }
474
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
475
    zSig = ( zSig + roundIncrement )>>10;
476
    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
477
    if ( zSig == 0 ) zExp = 0;
478
    return packFloat64( zSign, zExp, zSig );
479

    
480
}
481

    
482
/*----------------------------------------------------------------------------
483
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
484
| and significand `zSig', and returns the proper double-precision floating-
485
| point value corresponding to the abstract input.  This routine is just like
486
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
487
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
488
| floating-point exponent.
489
*----------------------------------------------------------------------------*/
490

    
491
static float64
492
 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
493
{
494
    int8 shiftCount;
495

    
496
    shiftCount = countLeadingZeros64( zSig ) - 1;
497
    return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
498

    
499
}
500

    
501
#ifdef FLOATX80
502

    
503
/*----------------------------------------------------------------------------
504
| Returns the fraction bits of the extended double-precision floating-point
505
| value `a'.
506
*----------------------------------------------------------------------------*/
507

    
508
INLINE bits64 extractFloatx80Frac( floatx80 a )
509
{
510

    
511
    return a.low;
512

    
513
}
514

    
515
/*----------------------------------------------------------------------------
516
| Returns the exponent bits of the extended double-precision floating-point
517
| value `a'.
518
*----------------------------------------------------------------------------*/
519

    
520
INLINE int32 extractFloatx80Exp( floatx80 a )
521
{
522

    
523
    return a.high & 0x7FFF;
524

    
525
}
526

    
527
/*----------------------------------------------------------------------------
528
| Returns the sign bit of the extended double-precision floating-point value
529
| `a'.
530
*----------------------------------------------------------------------------*/
531

    
532
INLINE flag extractFloatx80Sign( floatx80 a )
533
{
534

    
535
    return a.high>>15;
536

    
537
}
538

    
539
/*----------------------------------------------------------------------------
540
| Normalizes the subnormal extended double-precision floating-point value
541
| represented by the denormalized significand `aSig'.  The normalized exponent
542
| and significand are stored at the locations pointed to by `zExpPtr' and
543
| `zSigPtr', respectively.
544
*----------------------------------------------------------------------------*/
545

    
546
static void
547
 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
548
{
549
    int8 shiftCount;
550

    
551
    shiftCount = countLeadingZeros64( aSig );
552
    *zSigPtr = aSig<<shiftCount;
553
    *zExpPtr = 1 - shiftCount;
554

    
555
}
556

    
557
/*----------------------------------------------------------------------------
558
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
559
| extended double-precision floating-point value, returning the result.
560
*----------------------------------------------------------------------------*/
561

    
562
INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
563
{
564
    floatx80 z;
565

    
566
    z.low = zSig;
567
    z.high = ( ( (bits16) zSign )<<15 ) + zExp;
568
    return z;
569

    
570
}
571

    
572
/*----------------------------------------------------------------------------
573
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
574
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
575
| and returns the proper extended double-precision floating-point value
576
| corresponding to the abstract input.  Ordinarily, the abstract value is
577
| rounded and packed into the extended double-precision format, with the
578
| inexact exception raised if the abstract input cannot be represented
579
| exactly.  However, if the abstract value is too large, the overflow and
580
| inexact exceptions are raised and an infinity or maximal finite value is
581
| returned.  If the abstract value is too small, the input value is rounded to
582
| a subnormal number, and the underflow and inexact exceptions are raised if
583
| the abstract input cannot be represented exactly as a subnormal extended
584
| double-precision floating-point number.
585
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
586
| number of bits as single or double precision, respectively.  Otherwise, the
587
| result is rounded to the full precision of the extended double-precision
588
| format.
589
|     The input significand must be normalized or smaller.  If the input
590
| significand is not normalized, `zExp' must be 0; in that case, the result
591
| returned is a subnormal number, and it must not require rounding.  The
592
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
593
| Floating-Point Arithmetic.
594
*----------------------------------------------------------------------------*/
595

    
596
static floatx80
597
 roundAndPackFloatx80(
598
     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
599
 STATUS_PARAM)
600
{
601
    int8 roundingMode;
602
    flag roundNearestEven, increment, isTiny;
603
    int64 roundIncrement, roundMask, roundBits;
604

    
605
    roundingMode = STATUS(float_rounding_mode);
606
    roundNearestEven = ( roundingMode == float_round_nearest_even );
607
    if ( roundingPrecision == 80 ) goto precision80;
608
    if ( roundingPrecision == 64 ) {
609
        roundIncrement = LIT64( 0x0000000000000400 );
610
        roundMask = LIT64( 0x00000000000007FF );
611
    }
612
    else if ( roundingPrecision == 32 ) {
613
        roundIncrement = LIT64( 0x0000008000000000 );
614
        roundMask = LIT64( 0x000000FFFFFFFFFF );
615
    }
616
    else {
617
        goto precision80;
618
    }
619
    zSig0 |= ( zSig1 != 0 );
620
    if ( ! roundNearestEven ) {
621
        if ( roundingMode == float_round_to_zero ) {
622
            roundIncrement = 0;
623
        }
624
        else {
625
            roundIncrement = roundMask;
626
            if ( zSign ) {
627
                if ( roundingMode == float_round_up ) roundIncrement = 0;
628
            }
629
            else {
630
                if ( roundingMode == float_round_down ) roundIncrement = 0;
631
            }
632
        }
633
    }
634
    roundBits = zSig0 & roundMask;
635
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
636
        if (    ( 0x7FFE < zExp )
637
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
638
           ) {
639
            goto overflow;
640
        }
641
        if ( zExp <= 0 ) {
642
            if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
643
            isTiny =
644
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
645
                || ( zExp < 0 )
646
                || ( zSig0 <= zSig0 + roundIncrement );
647
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
648
            zExp = 0;
649
            roundBits = zSig0 & roundMask;
650
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
651
            if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
652
            zSig0 += roundIncrement;
653
            if ( (sbits64) zSig0 < 0 ) zExp = 1;
654
            roundIncrement = roundMask + 1;
655
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
656
                roundMask |= roundIncrement;
657
            }
658
            zSig0 &= ~ roundMask;
659
            return packFloatx80( zSign, zExp, zSig0 );
660
        }
661
    }
662
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
663
    zSig0 += roundIncrement;
664
    if ( zSig0 < roundIncrement ) {
665
        ++zExp;
666
        zSig0 = LIT64( 0x8000000000000000 );
667
    }
668
    roundIncrement = roundMask + 1;
669
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
670
        roundMask |= roundIncrement;
671
    }
672
    zSig0 &= ~ roundMask;
673
    if ( zSig0 == 0 ) zExp = 0;
674
    return packFloatx80( zSign, zExp, zSig0 );
675
 precision80:
676
    increment = ( (sbits64) zSig1 < 0 );
677
    if ( ! roundNearestEven ) {
678
        if ( roundingMode == float_round_to_zero ) {
679
            increment = 0;
680
        }
681
        else {
682
            if ( zSign ) {
683
                increment = ( roundingMode == float_round_down ) && zSig1;
684
            }
685
            else {
686
                increment = ( roundingMode == float_round_up ) && zSig1;
687
            }
688
        }
689
    }
690
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
691
        if (    ( 0x7FFE < zExp )
692
             || (    ( zExp == 0x7FFE )
693
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
694
                  && increment
695
                )
696
           ) {
697
            roundMask = 0;
698
 overflow:
699
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
700
            if (    ( roundingMode == float_round_to_zero )
701
                 || ( zSign && ( roundingMode == float_round_up ) )
702
                 || ( ! zSign && ( roundingMode == float_round_down ) )
703
               ) {
704
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
705
            }
706
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
707
        }
708
        if ( zExp <= 0 ) {
709
            isTiny =
710
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
711
                || ( zExp < 0 )
712
                || ! increment
713
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
714
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
715
            zExp = 0;
716
            if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
717
            if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
718
            if ( roundNearestEven ) {
719
                increment = ( (sbits64) zSig1 < 0 );
720
            }
721
            else {
722
                if ( zSign ) {
723
                    increment = ( roundingMode == float_round_down ) && zSig1;
724
                }
725
                else {
726
                    increment = ( roundingMode == float_round_up ) && zSig1;
727
                }
728
            }
729
            if ( increment ) {
730
                ++zSig0;
731
                zSig0 &=
732
                    ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
733
                if ( (sbits64) zSig0 < 0 ) zExp = 1;
734
            }
735
            return packFloatx80( zSign, zExp, zSig0 );
736
        }
737
    }
738
    if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
739
    if ( increment ) {
740
        ++zSig0;
741
        if ( zSig0 == 0 ) {
742
            ++zExp;
743
            zSig0 = LIT64( 0x8000000000000000 );
744
        }
745
        else {
746
            zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
747
        }
748
    }
749
    else {
750
        if ( zSig0 == 0 ) zExp = 0;
751
    }
752
    return packFloatx80( zSign, zExp, zSig0 );
753

    
754
}
755

    
756
/*----------------------------------------------------------------------------
757
| Takes an abstract floating-point value having sign `zSign', exponent
758
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
759
| and returns the proper extended double-precision floating-point value
760
| corresponding to the abstract input.  This routine is just like
761
| `roundAndPackFloatx80' except that the input significand does not have to be
762
| normalized.
763
*----------------------------------------------------------------------------*/
764

    
765
static floatx80
766
 normalizeRoundAndPackFloatx80(
767
     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
768
 STATUS_PARAM)
769
{
770
    int8 shiftCount;
771

    
772
    if ( zSig0 == 0 ) {
773
        zSig0 = zSig1;
774
        zSig1 = 0;
775
        zExp -= 64;
776
    }
777
    shiftCount = countLeadingZeros64( zSig0 );
778
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
779
    zExp -= shiftCount;
780
    return
781
        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
782

    
783
}
784

    
785
#endif
786

    
787
#ifdef FLOAT128
788

    
789
/*----------------------------------------------------------------------------
790
| Returns the least-significant 64 fraction bits of the quadruple-precision
791
| floating-point value `a'.
792
*----------------------------------------------------------------------------*/
793

    
794
INLINE bits64 extractFloat128Frac1( float128 a )
795
{
796

    
797
    return a.low;
798

    
799
}
800

    
801
/*----------------------------------------------------------------------------
802
| Returns the most-significant 48 fraction bits of the quadruple-precision
803
| floating-point value `a'.
804
*----------------------------------------------------------------------------*/
805

    
806
INLINE bits64 extractFloat128Frac0( float128 a )
807
{
808

    
809
    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
810

    
811
}
812

    
813
/*----------------------------------------------------------------------------
814
| Returns the exponent bits of the quadruple-precision floating-point value
815
| `a'.
816
*----------------------------------------------------------------------------*/
817

    
818
INLINE int32 extractFloat128Exp( float128 a )
819
{
820

    
821
    return ( a.high>>48 ) & 0x7FFF;
822

    
823
}
824

    
825
/*----------------------------------------------------------------------------
826
| Returns the sign bit of the quadruple-precision floating-point value `a'.
827
*----------------------------------------------------------------------------*/
828

    
829
INLINE flag extractFloat128Sign( float128 a )
830
{
831

    
832
    return a.high>>63;
833

    
834
}
835

    
836
/*----------------------------------------------------------------------------
837
| Normalizes the subnormal quadruple-precision floating-point value
838
| represented by the denormalized significand formed by the concatenation of
839
| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
840
| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
841
| significand are stored at the location pointed to by `zSig0Ptr', and the
842
| least significant 64 bits of the normalized significand are stored at the
843
| location pointed to by `zSig1Ptr'.
844
*----------------------------------------------------------------------------*/
845

    
846
static void
847
 normalizeFloat128Subnormal(
848
     bits64 aSig0,
849
     bits64 aSig1,
850
     int32 *zExpPtr,
851
     bits64 *zSig0Ptr,
852
     bits64 *zSig1Ptr
853
 )
854
{
855
    int8 shiftCount;
856

    
857
    if ( aSig0 == 0 ) {
858
        shiftCount = countLeadingZeros64( aSig1 ) - 15;
859
        if ( shiftCount < 0 ) {
860
            *zSig0Ptr = aSig1>>( - shiftCount );
861
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
862
        }
863
        else {
864
            *zSig0Ptr = aSig1<<shiftCount;
865
            *zSig1Ptr = 0;
866
        }
867
        *zExpPtr = - shiftCount - 63;
868
    }
869
    else {
870
        shiftCount = countLeadingZeros64( aSig0 ) - 15;
871
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
872
        *zExpPtr = 1 - shiftCount;
873
    }
874

    
875
}
876

    
877
/*----------------------------------------------------------------------------
878
| Packs the sign `zSign', the exponent `zExp', and the significand formed
879
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
880
| floating-point value, returning the result.  After being shifted into the
881
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
882
| added together to form the most significant 32 bits of the result.  This
883
| means that any integer portion of `zSig0' will be added into the exponent.
884
| Since a properly normalized significand will have an integer portion equal
885
| to 1, the `zExp' input should be 1 less than the desired result exponent
886
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
887
| significand.
888
*----------------------------------------------------------------------------*/
889

    
890
INLINE float128
891
 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
892
{
893
    float128 z;
894

    
895
    z.low = zSig1;
896
    z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
897
    return z;
898

    
899
}
900

    
901
/*----------------------------------------------------------------------------
902
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
903
| and extended significand formed by the concatenation of `zSig0', `zSig1',
904
| and `zSig2', and returns the proper quadruple-precision floating-point value
905
| corresponding to the abstract input.  Ordinarily, the abstract value is
906
| simply rounded and packed into the quadruple-precision format, with the
907
| inexact exception raised if the abstract input cannot be represented
908
| exactly.  However, if the abstract value is too large, the overflow and
909
| inexact exceptions are raised and an infinity or maximal finite value is
910
| returned.  If the abstract value is too small, the input value is rounded to
911
| a subnormal number, and the underflow and inexact exceptions are raised if
912
| the abstract input cannot be represented exactly as a subnormal quadruple-
913
| precision floating-point number.
914
|     The input significand must be normalized or smaller.  If the input
915
| significand is not normalized, `zExp' must be 0; in that case, the result
916
| returned is a subnormal number, and it must not require rounding.  In the
917
| usual case that the input significand is normalized, `zExp' must be 1 less
918
| than the ``true'' floating-point exponent.  The handling of underflow and
919
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
920
*----------------------------------------------------------------------------*/
921

    
922
static float128
923
 roundAndPackFloat128(
924
     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
925
{
926
    int8 roundingMode;
927
    flag roundNearestEven, increment, isTiny;
928

    
929
    roundingMode = STATUS(float_rounding_mode);
930
    roundNearestEven = ( roundingMode == float_round_nearest_even );
931
    increment = ( (sbits64) zSig2 < 0 );
932
    if ( ! roundNearestEven ) {
933
        if ( roundingMode == float_round_to_zero ) {
934
            increment = 0;
935
        }
936
        else {
937
            if ( zSign ) {
938
                increment = ( roundingMode == float_round_down ) && zSig2;
939
            }
940
            else {
941
                increment = ( roundingMode == float_round_up ) && zSig2;
942
            }
943
        }
944
    }
945
    if ( 0x7FFD <= (bits32) zExp ) {
946
        if (    ( 0x7FFD < zExp )
947
             || (    ( zExp == 0x7FFD )
948
                  && eq128(
949
                         LIT64( 0x0001FFFFFFFFFFFF ),
950
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
951
                         zSig0,
952
                         zSig1
953
                     )
954
                  && increment
955
                )
956
           ) {
957
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
958
            if (    ( roundingMode == float_round_to_zero )
959
                 || ( zSign && ( roundingMode == float_round_up ) )
960
                 || ( ! zSign && ( roundingMode == float_round_down ) )
961
               ) {
962
                return
963
                    packFloat128(
964
                        zSign,
965
                        0x7FFE,
966
                        LIT64( 0x0000FFFFFFFFFFFF ),
967
                        LIT64( 0xFFFFFFFFFFFFFFFF )
968
                    );
969
            }
970
            return packFloat128( zSign, 0x7FFF, 0, 0 );
971
        }
972
        if ( zExp < 0 ) {
973
            if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
974
            isTiny =
975
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
976
                || ( zExp < -1 )
977
                || ! increment
978
                || lt128(
979
                       zSig0,
980
                       zSig1,
981
                       LIT64( 0x0001FFFFFFFFFFFF ),
982
                       LIT64( 0xFFFFFFFFFFFFFFFF )
983
                   );
984
            shift128ExtraRightJamming(
985
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
986
            zExp = 0;
987
            if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
988
            if ( roundNearestEven ) {
989
                increment = ( (sbits64) zSig2 < 0 );
990
            }
991
            else {
992
                if ( zSign ) {
993
                    increment = ( roundingMode == float_round_down ) && zSig2;
994
                }
995
                else {
996
                    increment = ( roundingMode == float_round_up ) && zSig2;
997
                }
998
            }
999
        }
1000
    }
1001
    if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1002
    if ( increment ) {
1003
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1004
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1005
    }
1006
    else {
1007
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1008
    }
1009
    return packFloat128( zSign, zExp, zSig0, zSig1 );
1010

    
1011
}
1012

    
1013
/*----------------------------------------------------------------------------
1014
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1015
| and significand formed by the concatenation of `zSig0' and `zSig1', and
1016
| returns the proper quadruple-precision floating-point value corresponding
1017
| to the abstract input.  This routine is just like `roundAndPackFloat128'
1018
| except that the input significand has fewer bits and does not have to be
1019
| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1020
| point exponent.
1021
*----------------------------------------------------------------------------*/
1022

    
1023
static float128
1024
 normalizeRoundAndPackFloat128(
1025
     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1026
{
1027
    int8 shiftCount;
1028
    bits64 zSig2;
1029

    
1030
    if ( zSig0 == 0 ) {
1031
        zSig0 = zSig1;
1032
        zSig1 = 0;
1033
        zExp -= 64;
1034
    }
1035
    shiftCount = countLeadingZeros64( zSig0 ) - 15;
1036
    if ( 0 <= shiftCount ) {
1037
        zSig2 = 0;
1038
        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1039
    }
1040
    else {
1041
        shift128ExtraRightJamming(
1042
            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1043
    }
1044
    zExp -= shiftCount;
1045
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1046

    
1047
}
1048

    
1049
#endif
1050

    
1051
/*----------------------------------------------------------------------------
1052
| Returns the result of converting the 32-bit two's complement integer `a'
1053
| to the single-precision floating-point format.  The conversion is performed
1054
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1055
*----------------------------------------------------------------------------*/
1056

    
1057
float32 int32_to_float32( int32 a STATUS_PARAM )
1058
{
1059
    flag zSign;
1060

    
1061
    if ( a == 0 ) return float32_zero;
1062
    if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1063
    zSign = ( a < 0 );
1064
    return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1065

    
1066
}
1067

    
1068
/*----------------------------------------------------------------------------
1069
| Returns the result of converting the 32-bit two's complement integer `a'
1070
| to the double-precision floating-point format.  The conversion is performed
1071
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1072
*----------------------------------------------------------------------------*/
1073

    
1074
float64 int32_to_float64( int32 a STATUS_PARAM )
1075
{
1076
    flag zSign;
1077
    uint32 absA;
1078
    int8 shiftCount;
1079
    bits64 zSig;
1080

    
1081
    if ( a == 0 ) return float64_zero;
1082
    zSign = ( a < 0 );
1083
    absA = zSign ? - a : a;
1084
    shiftCount = countLeadingZeros32( absA ) + 21;
1085
    zSig = absA;
1086
    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1087

    
1088
}
1089

    
1090
#ifdef FLOATX80
1091

    
1092
/*----------------------------------------------------------------------------
1093
| Returns the result of converting the 32-bit two's complement integer `a'
1094
| to the extended double-precision floating-point format.  The conversion
1095
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1096
| Arithmetic.
1097
*----------------------------------------------------------------------------*/
1098

    
1099
floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1100
{
1101
    flag zSign;
1102
    uint32 absA;
1103
    int8 shiftCount;
1104
    bits64 zSig;
1105

    
1106
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1107
    zSign = ( a < 0 );
1108
    absA = zSign ? - a : a;
1109
    shiftCount = countLeadingZeros32( absA ) + 32;
1110
    zSig = absA;
1111
    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1112

    
1113
}
1114

    
1115
#endif
1116

    
1117
#ifdef FLOAT128
1118

    
1119
/*----------------------------------------------------------------------------
1120
| Returns the result of converting the 32-bit two's complement integer `a' to
1121
| the quadruple-precision floating-point format.  The conversion is performed
1122
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1123
*----------------------------------------------------------------------------*/
1124

    
1125
float128 int32_to_float128( int32 a STATUS_PARAM )
1126
{
1127
    flag zSign;
1128
    uint32 absA;
1129
    int8 shiftCount;
1130
    bits64 zSig0;
1131

    
1132
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1133
    zSign = ( a < 0 );
1134
    absA = zSign ? - a : a;
1135
    shiftCount = countLeadingZeros32( absA ) + 17;
1136
    zSig0 = absA;
1137
    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1138

    
1139
}
1140

    
1141
#endif
1142

    
1143
/*----------------------------------------------------------------------------
1144
| Returns the result of converting the 64-bit two's complement integer `a'
1145
| to the single-precision floating-point format.  The conversion is performed
1146
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1147
*----------------------------------------------------------------------------*/
1148

    
1149
float32 int64_to_float32( int64 a STATUS_PARAM )
1150
{
1151
    flag zSign;
1152
    uint64 absA;
1153
    int8 shiftCount;
1154

    
1155
    if ( a == 0 ) return float32_zero;
1156
    zSign = ( a < 0 );
1157
    absA = zSign ? - a : a;
1158
    shiftCount = countLeadingZeros64( absA ) - 40;
1159
    if ( 0 <= shiftCount ) {
1160
        return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1161
    }
1162
    else {
1163
        shiftCount += 7;
1164
        if ( shiftCount < 0 ) {
1165
            shift64RightJamming( absA, - shiftCount, &absA );
1166
        }
1167
        else {
1168
            absA <<= shiftCount;
1169
        }
1170
        return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1171
    }
1172

    
1173
}
1174

    
1175
float32 uint64_to_float32( uint64 a STATUS_PARAM )
1176
{
1177
    int8 shiftCount;
1178

    
1179
    if ( a == 0 ) return float32_zero;
1180
    shiftCount = countLeadingZeros64( a ) - 40;
1181
    if ( 0 <= shiftCount ) {
1182
        return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1183
    }
1184
    else {
1185
        shiftCount += 7;
1186
        if ( shiftCount < 0 ) {
1187
            shift64RightJamming( a, - shiftCount, &a );
1188
        }
1189
        else {
1190
            a <<= shiftCount;
1191
        }
1192
        return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1193
    }
1194
}
1195

    
1196
/*----------------------------------------------------------------------------
1197
| Returns the result of converting the 64-bit two's complement integer `a'
1198
| to the double-precision floating-point format.  The conversion is performed
1199
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1200
*----------------------------------------------------------------------------*/
1201

    
1202
float64 int64_to_float64( int64 a STATUS_PARAM )
1203
{
1204
    flag zSign;
1205

    
1206
    if ( a == 0 ) return float64_zero;
1207
    if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1208
        return packFloat64( 1, 0x43E, 0 );
1209
    }
1210
    zSign = ( a < 0 );
1211
    return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1212

    
1213
}
1214

    
1215
float64 uint64_to_float64( uint64 a STATUS_PARAM )
1216
{
1217
    if ( a == 0 ) return float64_zero;
1218
    return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1219

    
1220
}
1221

    
1222
#ifdef FLOATX80
1223

    
1224
/*----------------------------------------------------------------------------
1225
| Returns the result of converting the 64-bit two's complement integer `a'
1226
| to the extended double-precision floating-point format.  The conversion
1227
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1228
| Arithmetic.
1229
*----------------------------------------------------------------------------*/
1230

    
1231
floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1232
{
1233
    flag zSign;
1234
    uint64 absA;
1235
    int8 shiftCount;
1236

    
1237
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1238
    zSign = ( a < 0 );
1239
    absA = zSign ? - a : a;
1240
    shiftCount = countLeadingZeros64( absA );
1241
    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1242

    
1243
}
1244

    
1245
#endif
1246

    
1247
#ifdef FLOAT128
1248

    
1249
/*----------------------------------------------------------------------------
1250
| Returns the result of converting the 64-bit two's complement integer `a' to
1251
| the quadruple-precision floating-point format.  The conversion is performed
1252
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1253
*----------------------------------------------------------------------------*/
1254

    
1255
float128 int64_to_float128( int64 a STATUS_PARAM )
1256
{
1257
    flag zSign;
1258
    uint64 absA;
1259
    int8 shiftCount;
1260
    int32 zExp;
1261
    bits64 zSig0, zSig1;
1262

    
1263
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1264
    zSign = ( a < 0 );
1265
    absA = zSign ? - a : a;
1266
    shiftCount = countLeadingZeros64( absA ) + 49;
1267
    zExp = 0x406E - shiftCount;
1268
    if ( 64 <= shiftCount ) {
1269
        zSig1 = 0;
1270
        zSig0 = absA;
1271
        shiftCount -= 64;
1272
    }
1273
    else {
1274
        zSig1 = absA;
1275
        zSig0 = 0;
1276
    }
1277
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1278
    return packFloat128( zSign, zExp, zSig0, zSig1 );
1279

    
1280
}
1281

    
1282
#endif
1283

    
1284
/*----------------------------------------------------------------------------
1285
| Returns the result of converting the single-precision floating-point value
1286
| `a' to the 32-bit two's complement integer format.  The conversion is
1287
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1288
| Arithmetic---which means in particular that the conversion is rounded
1289
| according to the current rounding mode.  If `a' is a NaN, the largest
1290
| positive integer is returned.  Otherwise, if the conversion overflows, the
1291
| largest integer with the same sign as `a' is returned.
1292
*----------------------------------------------------------------------------*/
1293

    
1294
int32 float32_to_int32( float32 a STATUS_PARAM )
1295
{
1296
    flag aSign;
1297
    int16 aExp, shiftCount;
1298
    bits32 aSig;
1299
    bits64 aSig64;
1300

    
1301
    aSig = extractFloat32Frac( a );
1302
    aExp = extractFloat32Exp( a );
1303
    aSign = extractFloat32Sign( a );
1304
    if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1305
    if ( aExp ) aSig |= 0x00800000;
1306
    shiftCount = 0xAF - aExp;
1307
    aSig64 = aSig;
1308
    aSig64 <<= 32;
1309
    if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1310
    return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1311

    
1312
}
1313

    
1314
/*----------------------------------------------------------------------------
1315
| Returns the result of converting the single-precision floating-point value
1316
| `a' to the 32-bit two's complement integer format.  The conversion is
1317
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1318
| Arithmetic, except that the conversion is always rounded toward zero.
1319
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1320
| the conversion overflows, the largest integer with the same sign as `a' is
1321
| returned.
1322
*----------------------------------------------------------------------------*/
1323

    
1324
int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1325
{
1326
    flag aSign;
1327
    int16 aExp, shiftCount;
1328
    bits32 aSig;
1329
    int32 z;
1330

    
1331
    aSig = extractFloat32Frac( a );
1332
    aExp = extractFloat32Exp( a );
1333
    aSign = extractFloat32Sign( a );
1334
    shiftCount = aExp - 0x9E;
1335
    if ( 0 <= shiftCount ) {
1336
        if ( float32_val(a) != 0xCF000000 ) {
1337
            float_raise( float_flag_invalid STATUS_VAR);
1338
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1339
        }
1340
        return (sbits32) 0x80000000;
1341
    }
1342
    else if ( aExp <= 0x7E ) {
1343
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1344
        return 0;
1345
    }
1346
    aSig = ( aSig | 0x00800000 )<<8;
1347
    z = aSig>>( - shiftCount );
1348
    if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1349
        STATUS(float_exception_flags) |= float_flag_inexact;
1350
    }
1351
    if ( aSign ) z = - z;
1352
    return z;
1353

    
1354
}
1355

    
1356
/*----------------------------------------------------------------------------
1357
| Returns the result of converting the single-precision floating-point value
1358
| `a' to the 64-bit two's complement integer format.  The conversion is
1359
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1360
| Arithmetic---which means in particular that the conversion is rounded
1361
| according to the current rounding mode.  If `a' is a NaN, the largest
1362
| positive integer is returned.  Otherwise, if the conversion overflows, the
1363
| largest integer with the same sign as `a' is returned.
1364
*----------------------------------------------------------------------------*/
1365

    
1366
int64 float32_to_int64( float32 a STATUS_PARAM )
1367
{
1368
    flag aSign;
1369
    int16 aExp, shiftCount;
1370
    bits32 aSig;
1371
    bits64 aSig64, aSigExtra;
1372

    
1373
    aSig = extractFloat32Frac( a );
1374
    aExp = extractFloat32Exp( a );
1375
    aSign = extractFloat32Sign( a );
1376
    shiftCount = 0xBE - aExp;
1377
    if ( shiftCount < 0 ) {
1378
        float_raise( float_flag_invalid STATUS_VAR);
1379
        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1380
            return LIT64( 0x7FFFFFFFFFFFFFFF );
1381
        }
1382
        return (sbits64) LIT64( 0x8000000000000000 );
1383
    }
1384
    if ( aExp ) aSig |= 0x00800000;
1385
    aSig64 = aSig;
1386
    aSig64 <<= 40;
1387
    shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1388
    return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1389

    
1390
}
1391

    
1392
/*----------------------------------------------------------------------------
1393
| Returns the result of converting the single-precision floating-point value
1394
| `a' to the 64-bit two's complement integer format.  The conversion is
1395
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1396
| Arithmetic, except that the conversion is always rounded toward zero.  If
1397
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1398
| conversion overflows, the largest integer with the same sign as `a' is
1399
| returned.
1400
*----------------------------------------------------------------------------*/
1401

    
1402
int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1403
{
1404
    flag aSign;
1405
    int16 aExp, shiftCount;
1406
    bits32 aSig;
1407
    bits64 aSig64;
1408
    int64 z;
1409

    
1410
    aSig = extractFloat32Frac( a );
1411
    aExp = extractFloat32Exp( a );
1412
    aSign = extractFloat32Sign( a );
1413
    shiftCount = aExp - 0xBE;
1414
    if ( 0 <= shiftCount ) {
1415
        if ( float32_val(a) != 0xDF000000 ) {
1416
            float_raise( float_flag_invalid STATUS_VAR);
1417
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1418
                return LIT64( 0x7FFFFFFFFFFFFFFF );
1419
            }
1420
        }
1421
        return (sbits64) LIT64( 0x8000000000000000 );
1422
    }
1423
    else if ( aExp <= 0x7E ) {
1424
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1425
        return 0;
1426
    }
1427
    aSig64 = aSig | 0x00800000;
1428
    aSig64 <<= 40;
1429
    z = aSig64>>( - shiftCount );
1430
    if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1431
        STATUS(float_exception_flags) |= float_flag_inexact;
1432
    }
1433
    if ( aSign ) z = - z;
1434
    return z;
1435

    
1436
}
1437

    
1438
/*----------------------------------------------------------------------------
1439
| Returns the result of converting the single-precision floating-point value
1440
| `a' to the double-precision floating-point format.  The conversion is
1441
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1442
| Arithmetic.
1443
*----------------------------------------------------------------------------*/
1444

    
1445
float64 float32_to_float64( float32 a STATUS_PARAM )
1446
{
1447
    flag aSign;
1448
    int16 aExp;
1449
    bits32 aSig;
1450

    
1451
    aSig = extractFloat32Frac( a );
1452
    aExp = extractFloat32Exp( a );
1453
    aSign = extractFloat32Sign( a );
1454
    if ( aExp == 0xFF ) {
1455
        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1456
        return packFloat64( aSign, 0x7FF, 0 );
1457
    }
1458
    if ( aExp == 0 ) {
1459
        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1460
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1461
        --aExp;
1462
    }
1463
    return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1464

    
1465
}
1466

    
1467
#ifdef FLOATX80
1468

    
1469
/*----------------------------------------------------------------------------
1470
| Returns the result of converting the single-precision floating-point value
1471
| `a' to the extended double-precision floating-point format.  The conversion
1472
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1473
| Arithmetic.
1474
*----------------------------------------------------------------------------*/
1475

    
1476
floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1477
{
1478
    flag aSign;
1479
    int16 aExp;
1480
    bits32 aSig;
1481

    
1482
    aSig = extractFloat32Frac( a );
1483
    aExp = extractFloat32Exp( a );
1484
    aSign = extractFloat32Sign( a );
1485
    if ( aExp == 0xFF ) {
1486
        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1487
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1488
    }
1489
    if ( aExp == 0 ) {
1490
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1491
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1492
    }
1493
    aSig |= 0x00800000;
1494
    return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1495

    
1496
}
1497

    
1498
#endif
1499

    
1500
#ifdef FLOAT128
1501

    
1502
/*----------------------------------------------------------------------------
1503
| Returns the result of converting the single-precision floating-point value
1504
| `a' to the double-precision floating-point format.  The conversion is
1505
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1506
| Arithmetic.
1507
*----------------------------------------------------------------------------*/
1508

    
1509
float128 float32_to_float128( float32 a STATUS_PARAM )
1510
{
1511
    flag aSign;
1512
    int16 aExp;
1513
    bits32 aSig;
1514

    
1515
    aSig = extractFloat32Frac( a );
1516
    aExp = extractFloat32Exp( a );
1517
    aSign = extractFloat32Sign( a );
1518
    if ( aExp == 0xFF ) {
1519
        if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1520
        return packFloat128( aSign, 0x7FFF, 0, 0 );
1521
    }
1522
    if ( aExp == 0 ) {
1523
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1524
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1525
        --aExp;
1526
    }
1527
    return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1528

    
1529
}
1530

    
1531
#endif
1532

    
1533
/*----------------------------------------------------------------------------
1534
| Rounds the single-precision floating-point value `a' to an integer, and
1535
| returns the result as a single-precision floating-point value.  The
1536
| operation is performed according to the IEC/IEEE Standard for Binary
1537
| Floating-Point Arithmetic.
1538
*----------------------------------------------------------------------------*/
1539

    
1540
float32 float32_round_to_int( float32 a STATUS_PARAM)
1541
{
1542
    flag aSign;
1543
    int16 aExp;
1544
    bits32 lastBitMask, roundBitsMask;
1545
    int8 roundingMode;
1546
    bits32 z;
1547

    
1548
    aExp = extractFloat32Exp( a );
1549
    if ( 0x96 <= aExp ) {
1550
        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1551
            return propagateFloat32NaN( a, a STATUS_VAR );
1552
        }
1553
        return a;
1554
    }
1555
    if ( aExp <= 0x7E ) {
1556
        if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
1557
        STATUS(float_exception_flags) |= float_flag_inexact;
1558
        aSign = extractFloat32Sign( a );
1559
        switch ( STATUS(float_rounding_mode) ) {
1560
         case float_round_nearest_even:
1561
            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1562
                return packFloat32( aSign, 0x7F, 0 );
1563
            }
1564
            break;
1565
         case float_round_down:
1566
            return make_float32(aSign ? 0xBF800000 : 0);
1567
         case float_round_up:
1568
            return make_float32(aSign ? 0x80000000 : 0x3F800000);
1569
        }
1570
        return packFloat32( aSign, 0, 0 );
1571
    }
1572
    lastBitMask = 1;
1573
    lastBitMask <<= 0x96 - aExp;
1574
    roundBitsMask = lastBitMask - 1;
1575
    z = float32_val(a);
1576
    roundingMode = STATUS(float_rounding_mode);
1577
    if ( roundingMode == float_round_nearest_even ) {
1578
        z += lastBitMask>>1;
1579
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1580
    }
1581
    else if ( roundingMode != float_round_to_zero ) {
1582
        if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1583
            z += roundBitsMask;
1584
        }
1585
    }
1586
    z &= ~ roundBitsMask;
1587
    if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1588
    return make_float32(z);
1589

    
1590
}
1591

    
1592
/*----------------------------------------------------------------------------
1593
| Returns the result of adding the absolute values of the single-precision
1594
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1595
| before being returned.  `zSign' is ignored if the result is a NaN.
1596
| The addition is performed according to the IEC/IEEE Standard for Binary
1597
| Floating-Point Arithmetic.
1598
*----------------------------------------------------------------------------*/
1599

    
1600
static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1601
{
1602
    int16 aExp, bExp, zExp;
1603
    bits32 aSig, bSig, zSig;
1604
    int16 expDiff;
1605

    
1606
    aSig = extractFloat32Frac( a );
1607
    aExp = extractFloat32Exp( a );
1608
    bSig = extractFloat32Frac( b );
1609
    bExp = extractFloat32Exp( b );
1610
    expDiff = aExp - bExp;
1611
    aSig <<= 6;
1612
    bSig <<= 6;
1613
    if ( 0 < expDiff ) {
1614
        if ( aExp == 0xFF ) {
1615
            if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1616
            return a;
1617
        }
1618
        if ( bExp == 0 ) {
1619
            --expDiff;
1620
        }
1621
        else {
1622
            bSig |= 0x20000000;
1623
        }
1624
        shift32RightJamming( bSig, expDiff, &bSig );
1625
        zExp = aExp;
1626
    }
1627
    else if ( expDiff < 0 ) {
1628
        if ( bExp == 0xFF ) {
1629
            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1630
            return packFloat32( zSign, 0xFF, 0 );
1631
        }
1632
        if ( aExp == 0 ) {
1633
            ++expDiff;
1634
        }
1635
        else {
1636
            aSig |= 0x20000000;
1637
        }
1638
        shift32RightJamming( aSig, - expDiff, &aSig );
1639
        zExp = bExp;
1640
    }
1641
    else {
1642
        if ( aExp == 0xFF ) {
1643
            if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1644
            return a;
1645
        }
1646
        if ( aExp == 0 ) {
1647
            if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1648
            return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1649
        }
1650
        zSig = 0x40000000 + aSig + bSig;
1651
        zExp = aExp;
1652
        goto roundAndPack;
1653
    }
1654
    aSig |= 0x20000000;
1655
    zSig = ( aSig + bSig )<<1;
1656
    --zExp;
1657
    if ( (sbits32) zSig < 0 ) {
1658
        zSig = aSig + bSig;
1659
        ++zExp;
1660
    }
1661
 roundAndPack:
1662
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1663

    
1664
}
1665

    
1666
/*----------------------------------------------------------------------------
1667
| Returns the result of subtracting the absolute values of the single-
1668
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
1669
| difference is negated before being returned.  `zSign' is ignored if the
1670
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
1671
| Standard for Binary Floating-Point Arithmetic.
1672
*----------------------------------------------------------------------------*/
1673

    
1674
static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1675
{
1676
    int16 aExp, bExp, zExp;
1677
    bits32 aSig, bSig, zSig;
1678
    int16 expDiff;
1679

    
1680
    aSig = extractFloat32Frac( a );
1681
    aExp = extractFloat32Exp( a );
1682
    bSig = extractFloat32Frac( b );
1683
    bExp = extractFloat32Exp( b );
1684
    expDiff = aExp - bExp;
1685
    aSig <<= 7;
1686
    bSig <<= 7;
1687
    if ( 0 < expDiff ) goto aExpBigger;
1688
    if ( expDiff < 0 ) goto bExpBigger;
1689
    if ( aExp == 0xFF ) {
1690
        if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1691
        float_raise( float_flag_invalid STATUS_VAR);
1692
        return float32_default_nan;
1693
    }
1694
    if ( aExp == 0 ) {
1695
        aExp = 1;
1696
        bExp = 1;
1697
    }
1698
    if ( bSig < aSig ) goto aBigger;
1699
    if ( aSig < bSig ) goto bBigger;
1700
    return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1701
 bExpBigger:
1702
    if ( bExp == 0xFF ) {
1703
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1704
        return packFloat32( zSign ^ 1, 0xFF, 0 );
1705
    }
1706
    if ( aExp == 0 ) {
1707
        ++expDiff;
1708
    }
1709
    else {
1710
        aSig |= 0x40000000;
1711
    }
1712
    shift32RightJamming( aSig, - expDiff, &aSig );
1713
    bSig |= 0x40000000;
1714
 bBigger:
1715
    zSig = bSig - aSig;
1716
    zExp = bExp;
1717
    zSign ^= 1;
1718
    goto normalizeRoundAndPack;
1719
 aExpBigger:
1720
    if ( aExp == 0xFF ) {
1721
        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1722
        return a;
1723
    }
1724
    if ( bExp == 0 ) {
1725
        --expDiff;
1726
    }
1727
    else {
1728
        bSig |= 0x40000000;
1729
    }
1730
    shift32RightJamming( bSig, expDiff, &bSig );
1731
    aSig |= 0x40000000;
1732
 aBigger:
1733
    zSig = aSig - bSig;
1734
    zExp = aExp;
1735
 normalizeRoundAndPack:
1736
    --zExp;
1737
    return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1738

    
1739
}
1740

    
1741
/*----------------------------------------------------------------------------
1742
| Returns the result of adding the single-precision floating-point values `a'
1743
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
1744
| Binary Floating-Point Arithmetic.
1745
*----------------------------------------------------------------------------*/
1746

    
1747
float32 float32_add( float32 a, float32 b STATUS_PARAM )
1748
{
1749
    flag aSign, bSign;
1750

    
1751
    aSign = extractFloat32Sign( a );
1752
    bSign = extractFloat32Sign( b );
1753
    if ( aSign == bSign ) {
1754
        return addFloat32Sigs( a, b, aSign STATUS_VAR);
1755
    }
1756
    else {
1757
        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1758
    }
1759

    
1760
}
1761

    
1762
/*----------------------------------------------------------------------------
1763
| Returns the result of subtracting the single-precision floating-point values
1764
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1765
| for Binary Floating-Point Arithmetic.
1766
*----------------------------------------------------------------------------*/
1767

    
1768
float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1769
{
1770
    flag aSign, bSign;
1771

    
1772
    aSign = extractFloat32Sign( a );
1773
    bSign = extractFloat32Sign( b );
1774
    if ( aSign == bSign ) {
1775
        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1776
    }
1777
    else {
1778
        return addFloat32Sigs( a, b, aSign STATUS_VAR );
1779
    }
1780

    
1781
}
1782

    
1783
/*----------------------------------------------------------------------------
1784
| Returns the result of multiplying the single-precision floating-point values
1785
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1786
| for Binary Floating-Point Arithmetic.
1787
*----------------------------------------------------------------------------*/
1788

    
1789
float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1790
{
1791
    flag aSign, bSign, zSign;
1792
    int16 aExp, bExp, zExp;
1793
    bits32 aSig, bSig;
1794
    bits64 zSig64;
1795
    bits32 zSig;
1796

    
1797
    aSig = extractFloat32Frac( a );
1798
    aExp = extractFloat32Exp( a );
1799
    aSign = extractFloat32Sign( a );
1800
    bSig = extractFloat32Frac( b );
1801
    bExp = extractFloat32Exp( b );
1802
    bSign = extractFloat32Sign( b );
1803
    zSign = aSign ^ bSign;
1804
    if ( aExp == 0xFF ) {
1805
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1806
            return propagateFloat32NaN( a, b STATUS_VAR );
1807
        }
1808
        if ( ( bExp | bSig ) == 0 ) {
1809
            float_raise( float_flag_invalid STATUS_VAR);
1810
            return float32_default_nan;
1811
        }
1812
        return packFloat32( zSign, 0xFF, 0 );
1813
    }
1814
    if ( bExp == 0xFF ) {
1815
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1816
        if ( ( aExp | aSig ) == 0 ) {
1817
            float_raise( float_flag_invalid STATUS_VAR);
1818
            return float32_default_nan;
1819
        }
1820
        return packFloat32( zSign, 0xFF, 0 );
1821
    }
1822
    if ( aExp == 0 ) {
1823
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1824
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1825
    }
1826
    if ( bExp == 0 ) {
1827
        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1828
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1829
    }
1830
    zExp = aExp + bExp - 0x7F;
1831
    aSig = ( aSig | 0x00800000 )<<7;
1832
    bSig = ( bSig | 0x00800000 )<<8;
1833
    shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1834
    zSig = zSig64;
1835
    if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1836
        zSig <<= 1;
1837
        --zExp;
1838
    }
1839
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1840

    
1841
}
1842

    
1843
/*----------------------------------------------------------------------------
1844
| Returns the result of dividing the single-precision floating-point value `a'
1845
| by the corresponding value `b'.  The operation is performed according to the
1846
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1847
*----------------------------------------------------------------------------*/
1848

    
1849
float32 float32_div( float32 a, float32 b STATUS_PARAM )
1850
{
1851
    flag aSign, bSign, zSign;
1852
    int16 aExp, bExp, zExp;
1853
    bits32 aSig, bSig, zSig;
1854

    
1855
    aSig = extractFloat32Frac( a );
1856
    aExp = extractFloat32Exp( a );
1857
    aSign = extractFloat32Sign( a );
1858
    bSig = extractFloat32Frac( b );
1859
    bExp = extractFloat32Exp( b );
1860
    bSign = extractFloat32Sign( b );
1861
    zSign = aSign ^ bSign;
1862
    if ( aExp == 0xFF ) {
1863
        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1864
        if ( bExp == 0xFF ) {
1865
            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1866
            float_raise( float_flag_invalid STATUS_VAR);
1867
            return float32_default_nan;
1868
        }
1869
        return packFloat32( zSign, 0xFF, 0 );
1870
    }
1871
    if ( bExp == 0xFF ) {
1872
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1873
        return packFloat32( zSign, 0, 0 );
1874
    }
1875
    if ( bExp == 0 ) {
1876
        if ( bSig == 0 ) {
1877
            if ( ( aExp | aSig ) == 0 ) {
1878
                float_raise( float_flag_invalid STATUS_VAR);
1879
                return float32_default_nan;
1880
            }
1881
            float_raise( float_flag_divbyzero STATUS_VAR);
1882
            return packFloat32( zSign, 0xFF, 0 );
1883
        }
1884
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1885
    }
1886
    if ( aExp == 0 ) {
1887
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1888
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1889
    }
1890
    zExp = aExp - bExp + 0x7D;
1891
    aSig = ( aSig | 0x00800000 )<<7;
1892
    bSig = ( bSig | 0x00800000 )<<8;
1893
    if ( bSig <= ( aSig + aSig ) ) {
1894
        aSig >>= 1;
1895
        ++zExp;
1896
    }
1897
    zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1898
    if ( ( zSig & 0x3F ) == 0 ) {
1899
        zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1900
    }
1901
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1902

    
1903
}
1904

    
1905
/*----------------------------------------------------------------------------
1906
| Returns the remainder of the single-precision floating-point value `a'
1907
| with respect to the corresponding value `b'.  The operation is performed
1908
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1909
*----------------------------------------------------------------------------*/
1910

    
1911
float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1912
{
1913
    flag aSign, zSign;
1914
    int16 aExp, bExp, expDiff;
1915
    bits32 aSig, bSig;
1916
    bits32 q;
1917
    bits64 aSig64, bSig64, q64;
1918
    bits32 alternateASig;
1919
    sbits32 sigMean;
1920

    
1921
    aSig = extractFloat32Frac( a );
1922
    aExp = extractFloat32Exp( a );
1923
    aSign = extractFloat32Sign( a );
1924
    bSig = extractFloat32Frac( b );
1925
    bExp = extractFloat32Exp( b );
1926
    if ( aExp == 0xFF ) {
1927
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1928
            return propagateFloat32NaN( a, b STATUS_VAR );
1929
        }
1930
        float_raise( float_flag_invalid STATUS_VAR);
1931
        return float32_default_nan;
1932
    }
1933
    if ( bExp == 0xFF ) {
1934
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1935
        return a;
1936
    }
1937
    if ( bExp == 0 ) {
1938
        if ( bSig == 0 ) {
1939
            float_raise( float_flag_invalid STATUS_VAR);
1940
            return float32_default_nan;
1941
        }
1942
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1943
    }
1944
    if ( aExp == 0 ) {
1945
        if ( aSig == 0 ) return a;
1946
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1947
    }
1948
    expDiff = aExp - bExp;
1949
    aSig |= 0x00800000;
1950
    bSig |= 0x00800000;
1951
    if ( expDiff < 32 ) {
1952
        aSig <<= 8;
1953
        bSig <<= 8;
1954
        if ( expDiff < 0 ) {
1955
            if ( expDiff < -1 ) return a;
1956
            aSig >>= 1;
1957
        }
1958
        q = ( bSig <= aSig );
1959
        if ( q ) aSig -= bSig;
1960
        if ( 0 < expDiff ) {
1961
            q = ( ( (bits64) aSig )<<32 ) / bSig;
1962
            q >>= 32 - expDiff;
1963
            bSig >>= 2;
1964
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1965
        }
1966
        else {
1967
            aSig >>= 2;
1968
            bSig >>= 2;
1969
        }
1970
    }
1971
    else {
1972
        if ( bSig <= aSig ) aSig -= bSig;
1973
        aSig64 = ( (bits64) aSig )<<40;
1974
        bSig64 = ( (bits64) bSig )<<40;
1975
        expDiff -= 64;
1976
        while ( 0 < expDiff ) {
1977
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1978
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1979
            aSig64 = - ( ( bSig * q64 )<<38 );
1980
            expDiff -= 62;
1981
        }
1982
        expDiff += 64;
1983
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1984
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1985
        q = q64>>( 64 - expDiff );
1986
        bSig <<= 6;
1987
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1988
    }
1989
    do {
1990
        alternateASig = aSig;
1991
        ++q;
1992
        aSig -= bSig;
1993
    } while ( 0 <= (sbits32) aSig );
1994
    sigMean = aSig + alternateASig;
1995
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1996
        aSig = alternateASig;
1997
    }
1998
    zSign = ( (sbits32) aSig < 0 );
1999
    if ( zSign ) aSig = - aSig;
2000
    return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2001

    
2002
}
2003

    
2004
/*----------------------------------------------------------------------------
2005
| Returns the square root of the single-precision floating-point value `a'.
2006
| The operation is performed according to the IEC/IEEE Standard for Binary
2007
| Floating-Point Arithmetic.
2008
*----------------------------------------------------------------------------*/
2009

    
2010
float32 float32_sqrt( float32 a STATUS_PARAM )
2011
{
2012
    flag aSign;
2013
    int16 aExp, zExp;
2014
    bits32 aSig, zSig;
2015
    bits64 rem, term;
2016

    
2017
    aSig = extractFloat32Frac( a );
2018
    aExp = extractFloat32Exp( a );
2019
    aSign = extractFloat32Sign( a );
2020
    if ( aExp == 0xFF ) {
2021
        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2022
        if ( ! aSign ) return a;
2023
        float_raise( float_flag_invalid STATUS_VAR);
2024
        return float32_default_nan;
2025
    }
2026
    if ( aSign ) {
2027
        if ( ( aExp | aSig ) == 0 ) return a;
2028
        float_raise( float_flag_invalid STATUS_VAR);
2029
        return float32_default_nan;
2030
    }
2031
    if ( aExp == 0 ) {
2032
        if ( aSig == 0 ) return float32_zero;
2033
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2034
    }
2035
    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2036
    aSig = ( aSig | 0x00800000 )<<8;
2037
    zSig = estimateSqrt32( aExp, aSig ) + 2;
2038
    if ( ( zSig & 0x7F ) <= 5 ) {
2039
        if ( zSig < 2 ) {
2040
            zSig = 0x7FFFFFFF;
2041
            goto roundAndPack;
2042
        }
2043
        aSig >>= aExp & 1;
2044
        term = ( (bits64) zSig ) * zSig;
2045
        rem = ( ( (bits64) aSig )<<32 ) - term;
2046
        while ( (sbits64) rem < 0 ) {
2047
            --zSig;
2048
            rem += ( ( (bits64) zSig )<<1 ) | 1;
2049
        }
2050
        zSig |= ( rem != 0 );
2051
    }
2052
    shift32RightJamming( zSig, 1, &zSig );
2053
 roundAndPack:
2054
    return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2055

    
2056
}
2057

    
2058
/*----------------------------------------------------------------------------
2059
| Returns the binary exponential of the single-precision floating-point value
2060
| `a'. The operation is performed according to the IEC/IEEE Standard for
2061
| Binary Floating-Point Arithmetic.
2062
|
2063
| Uses the following identities:
2064
|
2065
| 1. -------------------------------------------------------------------------
2066
|      x    x*ln(2)
2067
|     2  = e
2068
|
2069
| 2. -------------------------------------------------------------------------
2070
|                      2     3     4     5           n
2071
|      x        x     x     x     x     x           x
2072
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2073
|               1!    2!    3!    4!    5!          n!
2074
*----------------------------------------------------------------------------*/
2075

    
2076
static const float64 float32_exp2_coefficients[15] =
2077
{
2078
    make_float64( 0x3ff0000000000000ll ), /*  1 */
2079
    make_float64( 0x3fe0000000000000ll ), /*  2 */
2080
    make_float64( 0x3fc5555555555555ll ), /*  3 */
2081
    make_float64( 0x3fa5555555555555ll ), /*  4 */
2082
    make_float64( 0x3f81111111111111ll ), /*  5 */
2083
    make_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2084
    make_float64( 0x3f2a01a01a01a01all ), /*  7 */
2085
    make_float64( 0x3efa01a01a01a01all ), /*  8 */
2086
    make_float64( 0x3ec71de3a556c734ll ), /*  9 */
2087
    make_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2088
    make_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2089
    make_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2090
    make_float64( 0x3de6124613a86d09ll ), /* 13 */
2091
    make_float64( 0x3da93974a8c07c9dll ), /* 14 */
2092
    make_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2093
};
2094

    
2095
float32 float32_exp2( float32 a STATUS_PARAM )
2096
{
2097
    flag aSign;
2098
    int16 aExp;
2099
    bits32 aSig;
2100
    float64 r, x, xn;
2101
    int i;
2102

    
2103
    aSig = extractFloat32Frac( a );
2104
    aExp = extractFloat32Exp( a );
2105
    aSign = extractFloat32Sign( a );
2106

    
2107
    if ( aExp == 0xFF) {
2108
        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2109
        return (aSign) ? float32_zero : a;
2110
    }
2111
    if (aExp == 0) {
2112
        if (aSig == 0) return float32_one;
2113
    }
2114

    
2115
    float_raise( float_flag_inexact STATUS_VAR);
2116

    
2117
    /* ******************************* */
2118
    /* using float64 for approximation */
2119
    /* ******************************* */
2120
    x = float32_to_float64(a STATUS_VAR);
2121
    x = float64_mul(x, float64_ln2 STATUS_VAR);
2122

    
2123
    xn = x;
2124
    r = float64_one;
2125
    for (i = 0 ; i < 15 ; i++) {
2126
        float64 f;
2127

    
2128
        f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2129
        r = float64_add(r, f STATUS_VAR);
2130

    
2131
        xn = float64_mul(xn, x STATUS_VAR);
2132
    }
2133

    
2134
    return float64_to_float32(r, status);
2135
}
2136

    
2137
/*----------------------------------------------------------------------------
2138
| Returns the binary log of the single-precision floating-point value `a'.
2139
| The operation is performed according to the IEC/IEEE Standard for Binary
2140
| Floating-Point Arithmetic.
2141
*----------------------------------------------------------------------------*/
2142
float32 float32_log2( float32 a STATUS_PARAM )
2143
{
2144
    flag aSign, zSign;
2145
    int16 aExp;
2146
    bits32 aSig, zSig, i;
2147

    
2148
    aSig = extractFloat32Frac( a );
2149
    aExp = extractFloat32Exp( a );
2150
    aSign = extractFloat32Sign( a );
2151

    
2152
    if ( aExp == 0 ) {
2153
        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2154
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2155
    }
2156
    if ( aSign ) {
2157
        float_raise( float_flag_invalid STATUS_VAR);
2158
        return float32_default_nan;
2159
    }
2160
    if ( aExp == 0xFF ) {
2161
        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2162
        return a;
2163
    }
2164

    
2165
    aExp -= 0x7F;
2166
    aSig |= 0x00800000;
2167
    zSign = aExp < 0;
2168
    zSig = aExp << 23;
2169

    
2170
    for (i = 1 << 22; i > 0; i >>= 1) {
2171
        aSig = ( (bits64)aSig * aSig ) >> 23;
2172
        if ( aSig & 0x01000000 ) {
2173
            aSig >>= 1;
2174
            zSig |= i;
2175
        }
2176
    }
2177

    
2178
    if ( zSign )
2179
        zSig = -zSig;
2180

    
2181
    return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2182
}
2183

    
2184
/*----------------------------------------------------------------------------
2185
| Returns 1 if the single-precision floating-point value `a' is equal to
2186
| the corresponding value `b', and 0 otherwise.  The comparison is performed
2187
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2188
*----------------------------------------------------------------------------*/
2189

    
2190
int float32_eq( float32 a, float32 b STATUS_PARAM )
2191
{
2192

    
2193
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2194
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2195
       ) {
2196
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2197
            float_raise( float_flag_invalid STATUS_VAR);
2198
        }
2199
        return 0;
2200
    }
2201
    return ( float32_val(a) == float32_val(b) ) ||
2202
            ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2203

    
2204
}
2205

    
2206
/*----------------------------------------------------------------------------
2207
| Returns 1 if the single-precision floating-point value `a' is less than
2208
| or equal to the corresponding value `b', and 0 otherwise.  The comparison
2209
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2210
| Arithmetic.
2211
*----------------------------------------------------------------------------*/
2212

    
2213
int float32_le( float32 a, float32 b STATUS_PARAM )
2214
{
2215
    flag aSign, bSign;
2216
    bits32 av, bv;
2217

    
2218
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2219
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2220
       ) {
2221
        float_raise( float_flag_invalid STATUS_VAR);
2222
        return 0;
2223
    }
2224
    aSign = extractFloat32Sign( a );
2225
    bSign = extractFloat32Sign( b );
2226
    av = float32_val(a);
2227
    bv = float32_val(b);
2228
    if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2229
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2230

    
2231
}
2232

    
2233
/*----------------------------------------------------------------------------
2234
| Returns 1 if the single-precision floating-point value `a' is less than
2235
| the corresponding value `b', and 0 otherwise.  The comparison is performed
2236
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2237
*----------------------------------------------------------------------------*/
2238

    
2239
int float32_lt( float32 a, float32 b STATUS_PARAM )
2240
{
2241
    flag aSign, bSign;
2242
    bits32 av, bv;
2243

    
2244
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2245
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2246
       ) {
2247
        float_raise( float_flag_invalid STATUS_VAR);
2248
        return 0;
2249
    }
2250
    aSign = extractFloat32Sign( a );
2251
    bSign = extractFloat32Sign( b );
2252
    av = float32_val(a);
2253
    bv = float32_val(b);
2254
    if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2255
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2256

    
2257
}
2258

    
2259
/*----------------------------------------------------------------------------
2260
| Returns 1 if the single-precision floating-point value `a' is equal to
2261
| the corresponding value `b', and 0 otherwise.  The invalid exception is
2262
| raised if either operand is a NaN.  Otherwise, the comparison is performed
2263
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2264
*----------------------------------------------------------------------------*/
2265

    
2266
int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2267
{
2268
    bits32 av, bv;
2269

    
2270
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2271
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2272
       ) {
2273
        float_raise( float_flag_invalid STATUS_VAR);
2274
        return 0;
2275
    }
2276
    av = float32_val(a);
2277
    bv = float32_val(b);
2278
    return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2279

    
2280
}
2281

    
2282
/*----------------------------------------------------------------------------
2283
| Returns 1 if the single-precision floating-point value `a' is less than or
2284
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2285
| cause an exception.  Otherwise, the comparison is performed according to the
2286
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2287
*----------------------------------------------------------------------------*/
2288

    
2289
int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2290
{
2291
    flag aSign, bSign;
2292
    bits32 av, bv;
2293

    
2294
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2295
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2296
       ) {
2297
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2298
            float_raise( float_flag_invalid STATUS_VAR);
2299
        }
2300
        return 0;
2301
    }
2302
    aSign = extractFloat32Sign( a );
2303
    bSign = extractFloat32Sign( b );
2304
    av = float32_val(a);
2305
    bv = float32_val(b);
2306
    if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2307
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2308

    
2309
}
2310

    
2311
/*----------------------------------------------------------------------------
2312
| Returns 1 if the single-precision floating-point value `a' is less than
2313
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2314
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2315
| Standard for Binary Floating-Point Arithmetic.
2316
*----------------------------------------------------------------------------*/
2317

    
2318
int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2319
{
2320
    flag aSign, bSign;
2321
    bits32 av, bv;
2322

    
2323
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2324
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2325
       ) {
2326
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2327
            float_raise( float_flag_invalid STATUS_VAR);
2328
        }
2329
        return 0;
2330
    }
2331
    aSign = extractFloat32Sign( a );
2332
    bSign = extractFloat32Sign( b );
2333
    av = float32_val(a);
2334
    bv = float32_val(b);
2335
    if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2336
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2337

    
2338
}
2339

    
2340
/*----------------------------------------------------------------------------
2341
| Returns the result of converting the double-precision floating-point value
2342
| `a' to the 32-bit two's complement integer format.  The conversion is
2343
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2344
| Arithmetic---which means in particular that the conversion is rounded
2345
| according to the current rounding mode.  If `a' is a NaN, the largest
2346
| positive integer is returned.  Otherwise, if the conversion overflows, the
2347
| largest integer with the same sign as `a' is returned.
2348
*----------------------------------------------------------------------------*/
2349

    
2350
int32 float64_to_int32( float64 a STATUS_PARAM )
2351
{
2352
    flag aSign;
2353
    int16 aExp, shiftCount;
2354
    bits64 aSig;
2355

    
2356
    aSig = extractFloat64Frac( a );
2357
    aExp = extractFloat64Exp( a );
2358
    aSign = extractFloat64Sign( a );
2359
    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2360
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2361
    shiftCount = 0x42C - aExp;
2362
    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2363
    return roundAndPackInt32( aSign, aSig STATUS_VAR );
2364

    
2365
}
2366

    
2367
/*----------------------------------------------------------------------------
2368
| Returns the result of converting the double-precision floating-point value
2369
| `a' to the 32-bit two's complement integer format.  The conversion is
2370
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2371
| Arithmetic, except that the conversion is always rounded toward zero.
2372
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2373
| the conversion overflows, the largest integer with the same sign as `a' is
2374
| returned.
2375
*----------------------------------------------------------------------------*/
2376

    
2377
int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2378
{
2379
    flag aSign;
2380
    int16 aExp, shiftCount;
2381
    bits64 aSig, savedASig;
2382
    int32 z;
2383

    
2384
    aSig = extractFloat64Frac( a );
2385
    aExp = extractFloat64Exp( a );
2386
    aSign = extractFloat64Sign( a );
2387
    if ( 0x41E < aExp ) {
2388
        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2389
        goto invalid;
2390
    }
2391
    else if ( aExp < 0x3FF ) {
2392
        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2393
        return 0;
2394
    }
2395
    aSig |= LIT64( 0x0010000000000000 );
2396
    shiftCount = 0x433 - aExp;
2397
    savedASig = aSig;
2398
    aSig >>= shiftCount;
2399
    z = aSig;
2400
    if ( aSign ) z = - z;
2401
    if ( ( z < 0 ) ^ aSign ) {
2402
 invalid:
2403
        float_raise( float_flag_invalid STATUS_VAR);
2404
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2405
    }
2406
    if ( ( aSig<<shiftCount ) != savedASig ) {
2407
        STATUS(float_exception_flags) |= float_flag_inexact;
2408
    }
2409
    return z;
2410

    
2411
}
2412

    
2413
/*----------------------------------------------------------------------------
2414
| Returns the result of converting the double-precision floating-point value
2415
| `a' to the 64-bit two's complement integer format.  The conversion is
2416
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2417
| Arithmetic---which means in particular that the conversion is rounded
2418
| according to the current rounding mode.  If `a' is a NaN, the largest
2419
| positive integer is returned.  Otherwise, if the conversion overflows, the
2420
| largest integer with the same sign as `a' is returned.
2421
*----------------------------------------------------------------------------*/
2422

    
2423
int64 float64_to_int64( float64 a STATUS_PARAM )
2424
{
2425
    flag aSign;
2426
    int16 aExp, shiftCount;
2427
    bits64 aSig, aSigExtra;
2428

    
2429
    aSig = extractFloat64Frac( a );
2430
    aExp = extractFloat64Exp( a );
2431
    aSign = extractFloat64Sign( a );
2432
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2433
    shiftCount = 0x433 - aExp;
2434
    if ( shiftCount <= 0 ) {
2435
        if ( 0x43E < aExp ) {
2436
            float_raise( float_flag_invalid STATUS_VAR);
2437
            if (    ! aSign
2438
                 || (    ( aExp == 0x7FF )
2439
                      && ( aSig != LIT64( 0x0010000000000000 ) ) )
2440
               ) {
2441
                return LIT64( 0x7FFFFFFFFFFFFFFF );
2442
            }
2443
            return (sbits64) LIT64( 0x8000000000000000 );
2444
        }
2445
        aSigExtra = 0;
2446
        aSig <<= - shiftCount;
2447
    }
2448
    else {
2449
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2450
    }
2451
    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2452

    
2453
}
2454

    
2455
/*----------------------------------------------------------------------------
2456
| Returns the result of converting the double-precision floating-point value
2457
| `a' to the 64-bit two's complement integer format.  The conversion is
2458
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2459
| Arithmetic, except that the conversion is always rounded toward zero.
2460
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2461
| the conversion overflows, the largest integer with the same sign as `a' is
2462
| returned.
2463
*----------------------------------------------------------------------------*/
2464

    
2465
int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2466
{
2467
    flag aSign;
2468
    int16 aExp, shiftCount;
2469
    bits64 aSig;
2470
    int64 z;
2471

    
2472
    aSig = extractFloat64Frac( a );
2473
    aExp = extractFloat64Exp( a );
2474
    aSign = extractFloat64Sign( a );
2475
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2476
    shiftCount = aExp - 0x433;
2477
    if ( 0 <= shiftCount ) {
2478
        if ( 0x43E <= aExp ) {
2479
            if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2480
                float_raise( float_flag_invalid STATUS_VAR);
2481
                if (    ! aSign
2482
                     || (    ( aExp == 0x7FF )
2483
                          && ( aSig != LIT64( 0x0010000000000000 ) ) )
2484
                   ) {
2485
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
2486
                }
2487
            }
2488
            return (sbits64) LIT64( 0x8000000000000000 );
2489
        }
2490
        z = aSig<<shiftCount;
2491
    }
2492
    else {
2493
        if ( aExp < 0x3FE ) {
2494
            if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2495
            return 0;
2496
        }
2497
        z = aSig>>( - shiftCount );
2498
        if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2499
            STATUS(float_exception_flags) |= float_flag_inexact;
2500
        }
2501
    }
2502
    if ( aSign ) z = - z;
2503
    return z;
2504

    
2505
}
2506

    
2507
/*----------------------------------------------------------------------------
2508
| Returns the result of converting the double-precision floating-point value
2509
| `a' to the single-precision floating-point format.  The conversion is
2510
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2511
| Arithmetic.
2512
*----------------------------------------------------------------------------*/
2513

    
2514
float32 float64_to_float32( float64 a STATUS_PARAM )
2515
{
2516
    flag aSign;
2517
    int16 aExp;
2518
    bits64 aSig;
2519
    bits32 zSig;
2520

    
2521
    aSig = extractFloat64Frac( a );
2522
    aExp = extractFloat64Exp( a );
2523
    aSign = extractFloat64Sign( a );
2524
    if ( aExp == 0x7FF ) {
2525
        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2526
        return packFloat32( aSign, 0xFF, 0 );
2527
    }
2528
    shift64RightJamming( aSig, 22, &aSig );
2529
    zSig = aSig;
2530
    if ( aExp || zSig ) {
2531
        zSig |= 0x40000000;
2532
        aExp -= 0x381;
2533
    }
2534
    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2535

    
2536
}
2537

    
2538

    
2539
/*----------------------------------------------------------------------------
2540
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2541
| half-precision floating-point value, returning the result.  After being
2542
| shifted into the proper positions, the three fields are simply added
2543
| together to form the result.  This means that any integer portion of `zSig'
2544
| will be added into the exponent.  Since a properly normalized significand
2545
| will have an integer portion equal to 1, the `zExp' input should be 1 less
2546
| than the desired result exponent whenever `zSig' is a complete, normalized
2547
| significand.
2548
*----------------------------------------------------------------------------*/
2549
static bits16 packFloat16(flag zSign, int16 zExp, bits16 zSig)
2550
{
2551
    return (((bits32)zSign) << 15) + (((bits32)zExp) << 10) + zSig;
2552
}
2553

    
2554
/* Half precision floats come in two formats: standard IEEE and "ARM" format.
2555
   The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
2556
  
2557
float32 float16_to_float32( bits16 a, flag ieee STATUS_PARAM )
2558
{
2559
    flag aSign;
2560
    int16 aExp;
2561
    bits32 aSig;
2562

    
2563
    aSign = a >> 15;
2564
    aExp = (a >> 10) & 0x1f;
2565
    aSig = a & 0x3ff;
2566

    
2567
    if (aExp == 0x1f && ieee) {
2568
        if (aSig) {
2569
            /* Make sure correct exceptions are raised.  */
2570
            float32ToCommonNaN(a STATUS_VAR);
2571
            aSig |= 0x200;
2572
        }
2573
        return packFloat32(aSign, 0xff, aSig << 13);
2574
    }
2575
    if (aExp == 0) {
2576
        int8 shiftCount;
2577

    
2578
        if (aSig == 0) {
2579
            return packFloat32(aSign, 0, 0);
2580
        }
2581

    
2582
        shiftCount = countLeadingZeros32( aSig ) - 21;
2583
        aSig = aSig << shiftCount;
2584
        aExp = -shiftCount;
2585
    }
2586
    return packFloat32( aSign, aExp + 0x70, aSig << 13);
2587
}
2588

    
2589
bits16 float32_to_float16( float32 a, flag ieee STATUS_PARAM)
2590
{
2591
    flag aSign;
2592
    int16 aExp;
2593
    bits32 aSig;
2594
    bits32 mask;
2595
    bits32 increment;
2596
    int8 roundingMode;
2597

    
2598
    aSig = extractFloat32Frac( a );
2599
    aExp = extractFloat32Exp( a );
2600
    aSign = extractFloat32Sign( a );
2601
    if ( aExp == 0xFF ) {
2602
        if (aSig) {
2603
            /* Make sure correct exceptions are raised.  */
2604
            float32ToCommonNaN(a STATUS_VAR);
2605
            aSig |= 0x00400000;
2606
        }
2607
        return packFloat16(aSign, 0x1f, aSig >> 13);
2608
    }
2609
    if (aExp == 0 && aSign == 0) {
2610
        return packFloat16(aSign, 0, 0);
2611
    }
2612
    /* Decimal point between bits 22 and 23.  */
2613
    aSig |= 0x00800000;
2614
    aExp -= 0x7f;
2615
    if (aExp < -14) {
2616
        mask = 0x007fffff;
2617
        if (aExp < -24) {
2618
            aExp = -25;
2619
        } else {
2620
            mask >>= 24 + aExp;
2621
        }
2622
    } else {
2623
        mask = 0x00001fff;
2624
    }
2625
    if (aSig & mask) {
2626
        float_raise( float_flag_underflow STATUS_VAR );
2627
        roundingMode = STATUS(float_rounding_mode);
2628
        switch (roundingMode) {
2629
        case float_round_nearest_even:
2630
            increment = (mask + 1) >> 1;
2631
            if ((aSig & mask) == increment) {
2632
                increment = aSig & (increment << 1);
2633
            }
2634
            break;
2635
        case float_round_up:
2636
            increment = aSign ? 0 : mask;
2637
            break;
2638
        case float_round_down:
2639
            increment = aSign ? mask : 0;
2640
            break;
2641
        default: /* round_to_zero */
2642
            increment = 0;
2643
            break;
2644
        }
2645
        aSig += increment;
2646
        if (aSig >= 0x01000000) {
2647
            aSig >>= 1;
2648
            aExp++;
2649
        }
2650
    } else if (aExp < -14
2651
          && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2652
        float_raise( float_flag_underflow STATUS_VAR);
2653
    }
2654

    
2655
    if (ieee) {
2656
        if (aExp > 15) {
2657
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2658
            return packFloat16(aSign, 0x1f, 0);
2659
        }
2660
    } else {
2661
        if (aExp > 16) {
2662
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2663
            return packFloat16(aSign, 0x1f, 0x3ff);
2664
        }
2665
    }
2666
    if (aExp < -24) {
2667
        return packFloat16(aSign, 0, 0);
2668
    }
2669
    if (aExp < -14) {
2670
        aSig >>= -14 - aExp;
2671
        aExp = -14;
2672
    }
2673
    return packFloat16(aSign, aExp + 14, aSig >> 13);
2674
}
2675

    
2676
#ifdef FLOATX80
2677

    
2678
/*----------------------------------------------------------------------------
2679
| Returns the result of converting the double-precision floating-point value
2680
| `a' to the extended double-precision floating-point format.  The conversion
2681
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2682
| Arithmetic.
2683
*----------------------------------------------------------------------------*/
2684

    
2685
floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2686
{
2687
    flag aSign;
2688
    int16 aExp;
2689
    bits64 aSig;
2690

    
2691
    aSig = extractFloat64Frac( a );
2692
    aExp = extractFloat64Exp( a );
2693
    aSign = extractFloat64Sign( a );
2694
    if ( aExp == 0x7FF ) {
2695
        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2696
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2697
    }
2698
    if ( aExp == 0 ) {
2699
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2700
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2701
    }
2702
    return
2703
        packFloatx80(
2704
            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2705

    
2706
}
2707

    
2708
#endif
2709

    
2710
#ifdef FLOAT128
2711

    
2712
/*----------------------------------------------------------------------------
2713
| Returns the result of converting the double-precision floating-point value
2714
| `a' to the quadruple-precision floating-point format.  The conversion is
2715
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2716
| Arithmetic.
2717
*----------------------------------------------------------------------------*/
2718

    
2719
float128 float64_to_float128( float64 a STATUS_PARAM )
2720
{
2721
    flag aSign;
2722
    int16 aExp;
2723
    bits64 aSig, zSig0, zSig1;
2724

    
2725
    aSig = extractFloat64Frac( a );
2726
    aExp = extractFloat64Exp( a );
2727
    aSign = extractFloat64Sign( a );
2728
    if ( aExp == 0x7FF ) {
2729
        if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2730
        return packFloat128( aSign, 0x7FFF, 0, 0 );
2731
    }
2732
    if ( aExp == 0 ) {
2733
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2734
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2735
        --aExp;
2736
    }
2737
    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2738
    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2739

    
2740
}
2741

    
2742
#endif
2743

    
2744
/*----------------------------------------------------------------------------
2745
| Rounds the double-precision floating-point value `a' to an integer, and
2746
| returns the result as a double-precision floating-point value.  The
2747
| operation is performed according to the IEC/IEEE Standard for Binary
2748
| Floating-Point Arithmetic.
2749
*----------------------------------------------------------------------------*/
2750

    
2751
float64 float64_round_to_int( float64 a STATUS_PARAM )
2752
{
2753
    flag aSign;
2754
    int16 aExp;
2755
    bits64 lastBitMask, roundBitsMask;
2756
    int8 roundingMode;
2757
    bits64 z;
2758

    
2759
    aExp = extractFloat64Exp( a );
2760
    if ( 0x433 <= aExp ) {
2761
        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2762
            return propagateFloat64NaN( a, a STATUS_VAR );
2763
        }
2764
        return a;
2765
    }
2766
    if ( aExp < 0x3FF ) {
2767
        if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
2768
        STATUS(float_exception_flags) |= float_flag_inexact;
2769
        aSign = extractFloat64Sign( a );
2770
        switch ( STATUS(float_rounding_mode) ) {
2771
         case float_round_nearest_even:
2772
            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2773
                return packFloat64( aSign, 0x3FF, 0 );
2774
            }
2775
            break;
2776
         case float_round_down:
2777
            return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2778
         case float_round_up:
2779
            return make_float64(
2780
            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2781
        }
2782
        return packFloat64( aSign, 0, 0 );
2783
    }
2784
    lastBitMask = 1;
2785
    lastBitMask <<= 0x433 - aExp;
2786
    roundBitsMask = lastBitMask - 1;
2787
    z = float64_val(a);
2788
    roundingMode = STATUS(float_rounding_mode);
2789
    if ( roundingMode == float_round_nearest_even ) {
2790
        z += lastBitMask>>1;
2791
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2792
    }
2793
    else if ( roundingMode != float_round_to_zero ) {
2794
        if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
2795
            z += roundBitsMask;
2796
        }
2797
    }
2798
    z &= ~ roundBitsMask;
2799
    if ( z != float64_val(a) )
2800
        STATUS(float_exception_flags) |= float_flag_inexact;
2801
    return make_float64(z);
2802

    
2803
}
2804

    
2805
float64 float64_trunc_to_int( float64 a STATUS_PARAM)
2806
{
2807
    int oldmode;
2808
    float64 res;
2809
    oldmode = STATUS(float_rounding_mode);
2810
    STATUS(float_rounding_mode) = float_round_to_zero;
2811
    res = float64_round_to_int(a STATUS_VAR);
2812
    STATUS(float_rounding_mode) = oldmode;
2813
    return res;
2814
}
2815

    
2816
/*----------------------------------------------------------------------------
2817
| Returns the result of adding the absolute values of the double-precision
2818
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2819
| before being returned.  `zSign' is ignored if the result is a NaN.
2820
| The addition is performed according to the IEC/IEEE Standard for Binary
2821
| Floating-Point Arithmetic.
2822
*----------------------------------------------------------------------------*/
2823

    
2824
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2825
{
2826
    int16 aExp, bExp, zExp;
2827
    bits64 aSig, bSig, zSig;
2828
    int16 expDiff;
2829

    
2830
    aSig = extractFloat64Frac( a );
2831
    aExp = extractFloat64Exp( a );
2832
    bSig = extractFloat64Frac( b );
2833
    bExp = extractFloat64Exp( b );
2834
    expDiff = aExp - bExp;
2835
    aSig <<= 9;
2836
    bSig <<= 9;
2837
    if ( 0 < expDiff ) {
2838
        if ( aExp == 0x7FF ) {
2839
            if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2840
            return a;
2841
        }
2842
        if ( bExp == 0 ) {
2843
            --expDiff;
2844
        }
2845
        else {
2846
            bSig |= LIT64( 0x2000000000000000 );
2847
        }
2848
        shift64RightJamming( bSig, expDiff, &bSig );
2849
        zExp = aExp;
2850
    }
2851
    else if ( expDiff < 0 ) {
2852
        if ( bExp == 0x7FF ) {
2853
            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2854
            return packFloat64( zSign, 0x7FF, 0 );
2855
        }
2856
        if ( aExp == 0 ) {
2857
            ++expDiff;
2858
        }
2859
        else {
2860
            aSig |= LIT64( 0x2000000000000000 );
2861
        }
2862
        shift64RightJamming( aSig, - expDiff, &aSig );
2863
        zExp = bExp;
2864
    }
2865
    else {
2866
        if ( aExp == 0x7FF ) {
2867
            if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2868
            return a;
2869
        }
2870
        if ( aExp == 0 ) {
2871
            if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
2872
            return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2873
        }
2874
        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2875
        zExp = aExp;
2876
        goto roundAndPack;
2877
    }
2878
    aSig |= LIT64( 0x2000000000000000 );
2879
    zSig = ( aSig + bSig )<<1;
2880
    --zExp;
2881
    if ( (sbits64) zSig < 0 ) {
2882
        zSig = aSig + bSig;
2883
        ++zExp;
2884
    }
2885
 roundAndPack:
2886
    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2887

    
2888
}
2889

    
2890
/*----------------------------------------------------------------------------
2891
| Returns the result of subtracting the absolute values of the double-
2892
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
2893
| difference is negated before being returned.  `zSign' is ignored if the
2894
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
2895
| Standard for Binary Floating-Point Arithmetic.
2896
*----------------------------------------------------------------------------*/
2897

    
2898
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2899
{
2900
    int16 aExp, bExp, zExp;
2901
    bits64 aSig, bSig, zSig;
2902
    int16 expDiff;
2903

    
2904
    aSig = extractFloat64Frac( a );
2905
    aExp = extractFloat64Exp( a );
2906
    bSig = extractFloat64Frac( b );
2907
    bExp = extractFloat64Exp( b );
2908
    expDiff = aExp - bExp;
2909
    aSig <<= 10;
2910
    bSig <<= 10;
2911
    if ( 0 < expDiff ) goto aExpBigger;
2912
    if ( expDiff < 0 ) goto bExpBigger;
2913
    if ( aExp == 0x7FF ) {
2914
        if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2915
        float_raise( float_flag_invalid STATUS_VAR);
2916
        return float64_default_nan;
2917
    }
2918
    if ( aExp == 0 ) {
2919
        aExp = 1;
2920
        bExp = 1;
2921
    }
2922
    if ( bSig < aSig ) goto aBigger;
2923
    if ( aSig < bSig ) goto bBigger;
2924
    return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2925
 bExpBigger:
2926
    if ( bExp == 0x7FF ) {
2927
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2928
        return packFloat64( zSign ^ 1, 0x7FF, 0 );
2929
    }
2930
    if ( aExp == 0 ) {
2931
        ++expDiff;
2932
    }
2933
    else {
2934
        aSig |= LIT64( 0x4000000000000000 );
2935
    }
2936
    shift64RightJamming( aSig, - expDiff, &aSig );
2937
    bSig |= LIT64( 0x4000000000000000 );
2938
 bBigger:
2939
    zSig = bSig - aSig;
2940
    zExp = bExp;
2941
    zSign ^= 1;
2942
    goto normalizeRoundAndPack;
2943
 aExpBigger:
2944
    if ( aExp == 0x7FF ) {
2945
        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2946
        return a;
2947
    }
2948
    if ( bExp == 0 ) {
2949
        --expDiff;
2950
    }
2951
    else {
2952
        bSig |= LIT64( 0x4000000000000000 );
2953
    }
2954
    shift64RightJamming( bSig, expDiff, &bSig );
2955
    aSig |= LIT64( 0x4000000000000000 );
2956
 aBigger:
2957
    zSig = aSig - bSig;
2958
    zExp = aExp;
2959
 normalizeRoundAndPack:
2960
    --zExp;
2961
    return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2962

    
2963
}
2964

    
2965
/*----------------------------------------------------------------------------
2966
| Returns the result of adding the double-precision floating-point values `a'
2967
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
2968
| Binary Floating-Point Arithmetic.
2969
*----------------------------------------------------------------------------*/
2970

    
2971
float64 float64_add( float64 a, float64 b STATUS_PARAM )
2972
{
2973
    flag aSign, bSign;
2974

    
2975
    aSign = extractFloat64Sign( a );
2976
    bSign = extractFloat64Sign( b );
2977
    if ( aSign == bSign ) {
2978
        return addFloat64Sigs( a, b, aSign STATUS_VAR );
2979
    }
2980
    else {
2981
        return subFloat64Sigs( a, b, aSign STATUS_VAR );
2982
    }
2983

    
2984
}
2985

    
2986
/*----------------------------------------------------------------------------
2987
| Returns the result of subtracting the double-precision floating-point values
2988
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2989
| for Binary Floating-Point Arithmetic.
2990
*----------------------------------------------------------------------------*/
2991

    
2992
float64 float64_sub( float64 a, float64 b STATUS_PARAM )
2993
{
2994
    flag aSign, bSign;
2995

    
2996
    aSign = extractFloat64Sign( a );
2997
    bSign = extractFloat64Sign( b );
2998
    if ( aSign == bSign ) {
2999
        return subFloat64Sigs( a, b, aSign STATUS_VAR );
3000
    }
3001
    else {
3002
        return addFloat64Sigs( a, b, aSign STATUS_VAR );
3003
    }
3004

    
3005
}
3006

    
3007
/*----------------------------------------------------------------------------
3008
| Returns the result of multiplying the double-precision floating-point values
3009
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3010
| for Binary Floating-Point Arithmetic.
3011
*----------------------------------------------------------------------------*/
3012

    
3013
float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3014
{
3015
    flag aSign, bSign, zSign;
3016
    int16 aExp, bExp, zExp;
3017
    bits64 aSig, bSig, zSig0, zSig1;
3018

    
3019
    aSig = extractFloat64Frac( a );
3020
    aExp = extractFloat64Exp( a );
3021
    aSign = extractFloat64Sign( a );
3022
    bSig = extractFloat64Frac( b );
3023
    bExp = extractFloat64Exp( b );
3024
    bSign = extractFloat64Sign( b );
3025
    zSign = aSign ^ bSign;
3026
    if ( aExp == 0x7FF ) {
3027
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3028
            return propagateFloat64NaN( a, b STATUS_VAR );
3029
        }
3030
        if ( ( bExp | bSig ) == 0 ) {
3031
            float_raise( float_flag_invalid STATUS_VAR);
3032
            return float64_default_nan;
3033
        }
3034
        return packFloat64( zSign, 0x7FF, 0 );
3035
    }
3036
    if ( bExp == 0x7FF ) {
3037
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3038
        if ( ( aExp | aSig ) == 0 ) {
3039
            float_raise( float_flag_invalid STATUS_VAR);
3040
            return float64_default_nan;
3041
        }
3042
        return packFloat64( zSign, 0x7FF, 0 );
3043
    }
3044
    if ( aExp == 0 ) {
3045
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3046
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3047
    }
3048
    if ( bExp == 0 ) {
3049
        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3050
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3051
    }
3052
    zExp = aExp + bExp - 0x3FF;
3053
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3054
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3055
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
3056
    zSig0 |= ( zSig1 != 0 );
3057
    if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
3058
        zSig0 <<= 1;
3059
        --zExp;
3060
    }
3061
    return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3062

    
3063
}
3064

    
3065
/*----------------------------------------------------------------------------
3066
| Returns the result of dividing the double-precision floating-point value `a'
3067
| by the corresponding value `b'.  The operation is performed according to
3068
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3069
*----------------------------------------------------------------------------*/
3070

    
3071
float64 float64_div( float64 a, float64 b STATUS_PARAM )
3072
{
3073
    flag aSign, bSign, zSign;
3074
    int16 aExp, bExp, zExp;
3075
    bits64 aSig, bSig, zSig;
3076
    bits64 rem0, rem1;
3077
    bits64 term0, term1;
3078

    
3079
    aSig = extractFloat64Frac( a );
3080
    aExp = extractFloat64Exp( a );
3081
    aSign = extractFloat64Sign( a );
3082
    bSig = extractFloat64Frac( b );
3083
    bExp = extractFloat64Exp( b );
3084
    bSign = extractFloat64Sign( b );
3085
    zSign = aSign ^ bSign;
3086
    if ( aExp == 0x7FF ) {
3087
        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3088
        if ( bExp == 0x7FF ) {
3089
            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3090
            float_raise( float_flag_invalid STATUS_VAR);
3091
            return float64_default_nan;
3092
        }
3093
        return packFloat64( zSign, 0x7FF, 0 );
3094
    }
3095
    if ( bExp == 0x7FF ) {
3096
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3097
        return packFloat64( zSign, 0, 0 );
3098
    }
3099
    if ( bExp == 0 ) {
3100
        if ( bSig == 0 ) {
3101
            if ( ( aExp | aSig ) == 0 ) {
3102
                float_raise( float_flag_invalid STATUS_VAR);
3103
                return float64_default_nan;
3104
            }
3105
            float_raise( float_flag_divbyzero STATUS_VAR);
3106
            return packFloat64( zSign, 0x7FF, 0 );
3107
        }
3108
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3109
    }
3110
    if ( aExp == 0 ) {
3111
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3112
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3113
    }
3114
    zExp = aExp - bExp + 0x3FD;
3115
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3116
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3117
    if ( bSig <= ( aSig + aSig ) ) {
3118
        aSig >>= 1;
3119
        ++zExp;
3120
    }
3121
    zSig = estimateDiv128To64( aSig, 0, bSig );
3122
    if ( ( zSig & 0x1FF ) <= 2 ) {
3123
        mul64To128( bSig, zSig, &term0, &term1 );
3124
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3125
        while ( (sbits64) rem0 < 0 ) {
3126
            --zSig;
3127
            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3128
        }
3129
        zSig |= ( rem1 != 0 );
3130
    }
3131
    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3132

    
3133
}
3134

    
3135
/*----------------------------------------------------------------------------
3136
| Returns the remainder of the double-precision floating-point value `a'
3137
| with respect to the corresponding value `b'.  The operation is performed
3138
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3139
*----------------------------------------------------------------------------*/
3140

    
3141
float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3142
{
3143
    flag aSign, zSign;
3144
    int16 aExp, bExp, expDiff;
3145
    bits64 aSig, bSig;
3146
    bits64 q, alternateASig;
3147
    sbits64 sigMean;
3148

    
3149
    aSig = extractFloat64Frac( a );
3150
    aExp = extractFloat64Exp( a );
3151
    aSign = extractFloat64Sign( a );
3152
    bSig = extractFloat64Frac( b );
3153
    bExp = extractFloat64Exp( b );
3154
    if ( aExp == 0x7FF ) {
3155
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3156
            return propagateFloat64NaN( a, b STATUS_VAR );
3157
        }
3158
        float_raise( float_flag_invalid STATUS_VAR);
3159
        return float64_default_nan;
3160
    }
3161
    if ( bExp == 0x7FF ) {
3162
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3163
        return a;
3164
    }
3165
    if ( bExp == 0 ) {
3166
        if ( bSig == 0 ) {
3167
            float_raise( float_flag_invalid STATUS_VAR);
3168
            return float64_default_nan;
3169
        }
3170
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3171
    }
3172
    if ( aExp == 0 ) {
3173
        if ( aSig == 0 ) return a;
3174
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3175
    }
3176
    expDiff = aExp - bExp;
3177
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3178
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3179
    if ( expDiff < 0 ) {
3180
        if ( expDiff < -1 ) return a;
3181
        aSig >>= 1;
3182
    }
3183
    q = ( bSig <= aSig );
3184
    if ( q ) aSig -= bSig;
3185
    expDiff -= 64;
3186
    while ( 0 < expDiff ) {
3187
        q = estimateDiv128To64( aSig, 0, bSig );
3188
        q = ( 2 < q ) ? q - 2 : 0;
3189
        aSig = - ( ( bSig>>2 ) * q );
3190
        expDiff -= 62;
3191
    }
3192
    expDiff += 64;
3193
    if ( 0 < expDiff ) {
3194
        q = estimateDiv128To64( aSig, 0, bSig );
3195
        q = ( 2 < q ) ? q - 2 : 0;
3196
        q >>= 64 - expDiff;
3197
        bSig >>= 2;
3198
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3199
    }
3200
    else {
3201
        aSig >>= 2;
3202
        bSig >>= 2;
3203
    }
3204
    do {
3205
        alternateASig = aSig;
3206
        ++q;
3207
        aSig -= bSig;
3208
    } while ( 0 <= (sbits64) aSig );
3209
    sigMean = aSig + alternateASig;
3210
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3211
        aSig = alternateASig;
3212
    }
3213
    zSign = ( (sbits64) aSig < 0 );
3214
    if ( zSign ) aSig = - aSig;
3215
    return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3216

    
3217
}
3218

    
3219
/*----------------------------------------------------------------------------
3220
| Returns the square root of the double-precision floating-point value `a'.
3221
| The operation is performed according to the IEC/IEEE Standard for Binary
3222
| Floating-Point Arithmetic.
3223
*----------------------------------------------------------------------------*/
3224

    
3225
float64 float64_sqrt( float64 a STATUS_PARAM )
3226
{
3227
    flag aSign;
3228
    int16 aExp, zExp;
3229
    bits64 aSig, zSig, doubleZSig;
3230
    bits64 rem0, rem1, term0, term1;
3231

    
3232
    aSig = extractFloat64Frac( a );
3233
    aExp = extractFloat64Exp( a );
3234
    aSign = extractFloat64Sign( a );
3235
    if ( aExp == 0x7FF ) {
3236
        if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3237
        if ( ! aSign ) return a;
3238
        float_raise( float_flag_invalid STATUS_VAR);
3239
        return float64_default_nan;
3240
    }
3241
    if ( aSign ) {
3242
        if ( ( aExp | aSig ) == 0 ) return a;
3243
        float_raise( float_flag_invalid STATUS_VAR);
3244
        return float64_default_nan;
3245
    }
3246
    if ( aExp == 0 ) {
3247
        if ( aSig == 0 ) return float64_zero;
3248
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3249
    }
3250
    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3251
    aSig |= LIT64( 0x0010000000000000 );
3252
    zSig = estimateSqrt32( aExp, aSig>>21 );
3253
    aSig <<= 9 - ( aExp & 1 );
3254
    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3255
    if ( ( zSig & 0x1FF ) <= 5 ) {
3256
        doubleZSig = zSig<<1;
3257
        mul64To128( zSig, zSig, &term0, &term1 );
3258
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3259
        while ( (sbits64) rem0 < 0 ) {
3260
            --zSig;
3261
            doubleZSig -= 2;
3262
            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3263
        }
3264
        zSig |= ( ( rem0 | rem1 ) != 0 );
3265
    }
3266
    return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3267

    
3268
}
3269

    
3270
/*----------------------------------------------------------------------------
3271
| Returns the binary log of the double-precision floating-point value `a'.
3272
| The operation is performed according to the IEC/IEEE Standard for Binary
3273
| Floating-Point Arithmetic.
3274
*----------------------------------------------------------------------------*/
3275
float64 float64_log2( float64 a STATUS_PARAM )
3276
{
3277
    flag aSign, zSign;
3278
    int16 aExp;
3279
    bits64 aSig, aSig0, aSig1, zSig, i;
3280

    
3281
    aSig = extractFloat64Frac( a );
3282
    aExp = extractFloat64Exp( a );
3283
    aSign = extractFloat64Sign( a );
3284

    
3285
    if ( aExp == 0 ) {
3286
        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3287
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3288
    }
3289
    if ( aSign ) {
3290
        float_raise( float_flag_invalid STATUS_VAR);
3291
        return float64_default_nan;
3292
    }
3293
    if ( aExp == 0x7FF ) {
3294
        if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3295
        return a;
3296
    }
3297

    
3298
    aExp -= 0x3FF;
3299
    aSig |= LIT64( 0x0010000000000000 );
3300
    zSign = aExp < 0;
3301
    zSig = (bits64)aExp << 52;
3302
    for (i = 1LL << 51; i > 0; i >>= 1) {
3303
        mul64To128( aSig, aSig, &aSig0, &aSig1 );
3304
        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3305
        if ( aSig & LIT64( 0x0020000000000000 ) ) {
3306
            aSig >>= 1;
3307
            zSig |= i;
3308
        }
3309
    }
3310

    
3311
    if ( zSign )
3312
        zSig = -zSig;
3313
    return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3314
}
3315

    
3316
/*----------------------------------------------------------------------------
3317
| Returns 1 if the double-precision floating-point value `a' is equal to the
3318
| corresponding value `b', and 0 otherwise.  The comparison is performed
3319
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3320
*----------------------------------------------------------------------------*/
3321

    
3322
int float64_eq( float64 a, float64 b STATUS_PARAM )
3323
{
3324
    bits64 av, bv;
3325

    
3326
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3327
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3328
       ) {
3329
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3330
            float_raise( float_flag_invalid STATUS_VAR);
3331
        }
3332
        return 0;
3333
    }
3334
    av = float64_val(a);
3335
    bv = float64_val(b);
3336
    return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3337

    
3338
}
3339

    
3340
/*----------------------------------------------------------------------------
3341
| Returns 1 if the double-precision floating-point value `a' is less than or
3342
| equal to the corresponding value `b', and 0 otherwise.  The comparison is
3343
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3344
| Arithmetic.
3345
*----------------------------------------------------------------------------*/
3346

    
3347
int float64_le( float64 a, float64 b STATUS_PARAM )
3348
{
3349
    flag aSign, bSign;
3350
    bits64 av, bv;
3351

    
3352
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3353
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3354
       ) {
3355
        float_raise( float_flag_invalid STATUS_VAR);
3356
        return 0;
3357
    }
3358
    aSign = extractFloat64Sign( a );
3359
    bSign = extractFloat64Sign( b );
3360
    av = float64_val(a);
3361
    bv = float64_val(b);
3362
    if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3363
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3364

    
3365
}
3366

    
3367
/*----------------------------------------------------------------------------
3368
| Returns 1 if the double-precision floating-point value `a' is less than
3369
| the corresponding value `b', and 0 otherwise.  The comparison is performed
3370
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3371
*----------------------------------------------------------------------------*/
3372

    
3373
int float64_lt( float64 a, float64 b STATUS_PARAM )
3374
{
3375
    flag aSign, bSign;
3376
    bits64 av, bv;
3377

    
3378
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3379
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3380
       ) {
3381
        float_raise( float_flag_invalid STATUS_VAR);
3382
        return 0;
3383
    }
3384
    aSign = extractFloat64Sign( a );
3385
    bSign = extractFloat64Sign( b );
3386
    av = float64_val(a);
3387
    bv = float64_val(b);
3388
    if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3389
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3390

    
3391
}
3392

    
3393
/*----------------------------------------------------------------------------
3394
| Returns 1 if the double-precision floating-point value `a' is equal to the
3395
| corresponding value `b', and 0 otherwise.  The invalid exception is raised
3396
| if either operand is a NaN.  Otherwise, the comparison is performed
3397
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3398
*----------------------------------------------------------------------------*/
3399

    
3400
int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3401
{
3402
    bits64 av, bv;
3403

    
3404
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3405
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3406
       ) {
3407
        float_raise( float_flag_invalid STATUS_VAR);
3408
        return 0;
3409
    }
3410
    av = float64_val(a);
3411
    bv = float64_val(b);
3412
    return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3413

    
3414
}
3415

    
3416
/*----------------------------------------------------------------------------
3417
| Returns 1 if the double-precision floating-point value `a' is less than or
3418
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3419
| cause an exception.  Otherwise, the comparison is performed according to the
3420
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3421
*----------------------------------------------------------------------------*/
3422

    
3423
int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3424
{
3425
    flag aSign, bSign;
3426
    bits64 av, bv;
3427

    
3428
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3429
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3430
       ) {
3431
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3432
            float_raise( float_flag_invalid STATUS_VAR);
3433
        }
3434
        return 0;
3435
    }
3436
    aSign = extractFloat64Sign( a );
3437
    bSign = extractFloat64Sign( b );
3438
    av = float64_val(a);
3439
    bv = float64_val(b);
3440
    if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3441
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3442

    
3443
}
3444

    
3445
/*----------------------------------------------------------------------------
3446
| Returns 1 if the double-precision floating-point value `a' is less than
3447
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3448
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3449
| Standard for Binary Floating-Point Arithmetic.
3450
*----------------------------------------------------------------------------*/
3451

    
3452
int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3453
{
3454
    flag aSign, bSign;
3455
    bits64 av, bv;
3456

    
3457
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3458
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3459
       ) {
3460
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3461
            float_raise( float_flag_invalid STATUS_VAR);
3462
        }
3463
        return 0;
3464
    }
3465
    aSign = extractFloat64Sign( a );
3466
    bSign = extractFloat64Sign( b );
3467
    av = float64_val(a);
3468
    bv = float64_val(b);
3469
    if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3470
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3471

    
3472
}
3473

    
3474
#ifdef FLOATX80
3475

    
3476
/*----------------------------------------------------------------------------
3477
| Returns the result of converting the extended double-precision floating-
3478
| point value `a' to the 32-bit two's complement integer format.  The
3479
| conversion is performed according to the IEC/IEEE Standard for Binary
3480
| Floating-Point Arithmetic---which means in particular that the conversion
3481
| is rounded according to the current rounding mode.  If `a' is a NaN, the
3482
| largest positive integer is returned.  Otherwise, if the conversion
3483
| overflows, the largest integer with the same sign as `a' is returned.
3484
*----------------------------------------------------------------------------*/
3485

    
3486
int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3487
{
3488
    flag aSign;
3489
    int32 aExp, shiftCount;
3490
    bits64 aSig;
3491

    
3492
    aSig = extractFloatx80Frac( a );
3493
    aExp = extractFloatx80Exp( a );
3494
    aSign = extractFloatx80Sign( a );
3495
    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3496
    shiftCount = 0x4037 - aExp;
3497
    if ( shiftCount <= 0 ) shiftCount = 1;
3498
    shift64RightJamming( aSig, shiftCount, &aSig );
3499
    return roundAndPackInt32( aSign, aSig STATUS_VAR );
3500

    
3501
}
3502

    
3503
/*----------------------------------------------------------------------------
3504
| Returns the result of converting the extended double-precision floating-
3505
| point value `a' to the 32-bit two's complement integer format.  The
3506
| conversion is performed according to the IEC/IEEE Standard for Binary
3507
| Floating-Point Arithmetic, except that the conversion is always rounded
3508
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3509
| Otherwise, if the conversion overflows, the largest integer with the same
3510
| sign as `a' is returned.
3511
*----------------------------------------------------------------------------*/
3512

    
3513
int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3514
{
3515
    flag aSign;
3516
    int32 aExp, shiftCount;
3517
    bits64 aSig, savedASig;
3518
    int32 z;
3519

    
3520
    aSig = extractFloatx80Frac( a );
3521
    aExp = extractFloatx80Exp( a );
3522
    aSign = extractFloatx80Sign( a );
3523
    if ( 0x401E < aExp ) {
3524
        if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3525
        goto invalid;
3526
    }
3527
    else if ( aExp < 0x3FFF ) {
3528
        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3529
        return 0;
3530
    }
3531
    shiftCount = 0x403E - aExp;
3532
    savedASig = aSig;
3533
    aSig >>= shiftCount;
3534
    z = aSig;
3535
    if ( aSign ) z = - z;
3536
    if ( ( z < 0 ) ^ aSign ) {
3537
 invalid:
3538
        float_raise( float_flag_invalid STATUS_VAR);
3539
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3540
    }
3541
    if ( ( aSig<<shiftCount ) != savedASig ) {
3542
        STATUS(float_exception_flags) |= float_flag_inexact;
3543
    }
3544
    return z;
3545

    
3546
}
3547

    
3548
/*----------------------------------------------------------------------------
3549
| Returns the result of converting the extended double-precision floating-
3550
| point value `a' to the 64-bit two's complement integer format.  The
3551
| conversion is performed according to the IEC/IEEE Standard for Binary
3552
| Floating-Point Arithmetic---which means in particular that the conversion
3553
| is rounded according to the current rounding mode.  If `a' is a NaN,
3554
| the largest positive integer is returned.  Otherwise, if the conversion
3555
| overflows, the largest integer with the same sign as `a' is returned.
3556
*----------------------------------------------------------------------------*/
3557

    
3558
int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3559
{
3560
    flag aSign;
3561
    int32 aExp, shiftCount;
3562
    bits64 aSig, aSigExtra;
3563

    
3564
    aSig = extractFloatx80Frac( a );
3565
    aExp = extractFloatx80Exp( a );
3566
    aSign = extractFloatx80Sign( a );
3567
    shiftCount = 0x403E - aExp;
3568
    if ( shiftCount <= 0 ) {
3569
        if ( shiftCount ) {
3570
            float_raise( float_flag_invalid STATUS_VAR);
3571
            if (    ! aSign
3572
                 || (    ( aExp == 0x7FFF )
3573
                      && ( aSig != LIT64( 0x8000000000000000 ) ) )
3574
               ) {
3575
                return LIT64( 0x7FFFFFFFFFFFFFFF );
3576
            }
3577
            return (sbits64) LIT64( 0x8000000000000000 );
3578
        }
3579
        aSigExtra = 0;
3580
    }
3581
    else {
3582
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3583
    }
3584
    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3585

    
3586
}
3587

    
3588
/*----------------------------------------------------------------------------
3589
| Returns the result of converting the extended double-precision floating-
3590
| point value `a' to the 64-bit two's complement integer format.  The
3591
| conversion is performed according to the IEC/IEEE Standard for Binary
3592
| Floating-Point Arithmetic, except that the conversion is always rounded
3593
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3594
| Otherwise, if the conversion overflows, the largest integer with the same
3595
| sign as `a' is returned.
3596
*----------------------------------------------------------------------------*/
3597

    
3598
int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3599
{
3600
    flag aSign;
3601
    int32 aExp, shiftCount;
3602
    bits64 aSig;
3603
    int64 z;
3604

    
3605
    aSig = extractFloatx80Frac( a );
3606
    aExp = extractFloatx80Exp( a );
3607
    aSign = extractFloatx80Sign( a );
3608
    shiftCount = aExp - 0x403E;
3609
    if ( 0 <= shiftCount ) {
3610
        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3611
        if ( ( a.high != 0xC03E ) || aSig ) {
3612
            float_raise( float_flag_invalid STATUS_VAR);
3613
            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3614
                return LIT64( 0x7FFFFFFFFFFFFFFF );
3615
            }
3616
        }
3617
        return (sbits64) LIT64( 0x8000000000000000 );
3618
    }
3619
    else if ( aExp < 0x3FFF ) {
3620
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3621
        return 0;
3622
    }
3623
    z = aSig>>( - shiftCount );
3624
    if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3625
        STATUS(float_exception_flags) |= float_flag_inexact;
3626
    }
3627
    if ( aSign ) z = - z;
3628
    return z;
3629

    
3630
}
3631

    
3632
/*----------------------------------------------------------------------------
3633
| Returns the result of converting the extended double-precision floating-
3634
| point value `a' to the single-precision floating-point format.  The
3635
| conversion is performed according to the IEC/IEEE Standard for Binary
3636
| Floating-Point Arithmetic.
3637
*----------------------------------------------------------------------------*/
3638

    
3639
float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3640
{
3641
    flag aSign;
3642
    int32 aExp;
3643
    bits64 aSig;
3644

    
3645
    aSig = extractFloatx80Frac( a );
3646
    aExp = extractFloatx80Exp( a );
3647
    aSign = extractFloatx80Sign( a );
3648
    if ( aExp == 0x7FFF ) {
3649
        if ( (bits64) ( aSig<<1 ) ) {
3650
            return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3651
        }
3652
        return packFloat32( aSign, 0xFF, 0 );
3653
    }
3654
    shift64RightJamming( aSig, 33, &aSig );
3655
    if ( aExp || aSig ) aExp -= 0x3F81;
3656
    return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3657

    
3658
}
3659

    
3660
/*----------------------------------------------------------------------------
3661
| Returns the result of converting the extended double-precision floating-
3662
| point value `a' to the double-precision floating-point format.  The
3663
| conversion is performed according to the IEC/IEEE Standard for Binary
3664
| Floating-Point Arithmetic.
3665
*----------------------------------------------------------------------------*/
3666

    
3667
float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3668
{
3669
    flag aSign;
3670
    int32 aExp;
3671
    bits64 aSig, zSig;
3672

    
3673
    aSig = extractFloatx80Frac( a );
3674
    aExp = extractFloatx80Exp( a );
3675
    aSign = extractFloatx80Sign( a );
3676
    if ( aExp == 0x7FFF ) {
3677
        if ( (bits64) ( aSig<<1 ) ) {
3678
            return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3679
        }
3680
        return packFloat64( aSign, 0x7FF, 0 );
3681
    }
3682
    shift64RightJamming( aSig, 1, &zSig );
3683
    if ( aExp || aSig ) aExp -= 0x3C01;
3684
    return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3685

    
3686
}
3687

    
3688
#ifdef FLOAT128
3689

    
3690
/*----------------------------------------------------------------------------
3691
| Returns the result of converting the extended double-precision floating-
3692
| point value `a' to the quadruple-precision floating-point format.  The
3693
| conversion is performed according to the IEC/IEEE Standard for Binary
3694
| Floating-Point Arithmetic.
3695
*----------------------------------------------------------------------------*/
3696

    
3697
float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3698
{
3699
    flag aSign;
3700
    int16 aExp;
3701
    bits64 aSig, zSig0, zSig1;
3702

    
3703
    aSig = extractFloatx80Frac( a );
3704
    aExp = extractFloatx80Exp( a );
3705
    aSign = extractFloatx80Sign( a );
3706
    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3707
        return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3708
    }
3709
    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3710
    return packFloat128( aSign, aExp, zSig0, zSig1 );
3711

    
3712
}
3713

    
3714
#endif
3715

    
3716
/*----------------------------------------------------------------------------
3717
| Rounds the extended double-precision floating-point value `a' to an integer,
3718
| and returns the result as an extended quadruple-precision floating-point
3719
| value.  The operation is performed according to the IEC/IEEE Standard for
3720
| Binary Floating-Point Arithmetic.
3721
*----------------------------------------------------------------------------*/
3722

    
3723
floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3724
{
3725
    flag aSign;
3726
    int32 aExp;
3727
    bits64 lastBitMask, roundBitsMask;
3728
    int8 roundingMode;
3729
    floatx80 z;
3730

    
3731
    aExp = extractFloatx80Exp( a );
3732
    if ( 0x403E <= aExp ) {
3733
        if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3734
            return propagateFloatx80NaN( a, a STATUS_VAR );
3735
        }
3736
        return a;
3737
    }
3738
    if ( aExp < 0x3FFF ) {
3739
        if (    ( aExp == 0 )
3740
             && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3741
            return a;
3742
        }
3743
        STATUS(float_exception_flags) |= float_flag_inexact;
3744
        aSign = extractFloatx80Sign( a );
3745
        switch ( STATUS(float_rounding_mode) ) {
3746
         case float_round_nearest_even:
3747
            if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3748
               ) {
3749
                return
3750
                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3751
            }
3752
            break;
3753
         case float_round_down:
3754
            return
3755
                  aSign ?
3756
                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3757
                : packFloatx80( 0, 0, 0 );
3758
         case float_round_up:
3759
            return
3760
                  aSign ? packFloatx80( 1, 0, 0 )
3761
                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3762
        }
3763
        return packFloatx80( aSign, 0, 0 );
3764
    }
3765
    lastBitMask = 1;
3766
    lastBitMask <<= 0x403E - aExp;
3767
    roundBitsMask = lastBitMask - 1;
3768
    z = a;
3769
    roundingMode = STATUS(float_rounding_mode);
3770
    if ( roundingMode == float_round_nearest_even ) {
3771
        z.low += lastBitMask>>1;
3772
        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3773
    }
3774
    else if ( roundingMode != float_round_to_zero ) {
3775
        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3776
            z.low += roundBitsMask;
3777
        }
3778
    }
3779
    z.low &= ~ roundBitsMask;
3780
    if ( z.low == 0 ) {
3781
        ++z.high;
3782
        z.low = LIT64( 0x8000000000000000 );
3783
    }
3784
    if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
3785
    return z;
3786

    
3787
}
3788

    
3789
/*----------------------------------------------------------------------------
3790
| Returns the result of adding the absolute values of the extended double-
3791
| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3792
| negated before being returned.  `zSign' is ignored if the result is a NaN.
3793
| The addition is performed according to the IEC/IEEE Standard for Binary
3794
| Floating-Point Arithmetic.
3795
*----------------------------------------------------------------------------*/
3796

    
3797
static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3798
{
3799
    int32 aExp, bExp, zExp;
3800
    bits64 aSig, bSig, zSig0, zSig1;
3801
    int32 expDiff;
3802

    
3803
    aSig = extractFloatx80Frac( a );
3804
    aExp = extractFloatx80Exp( a );
3805
    bSig = extractFloatx80Frac( b );
3806
    bExp = extractFloatx80Exp( b );
3807
    expDiff = aExp - bExp;
3808
    if ( 0 < expDiff ) {
3809
        if ( aExp == 0x7FFF ) {
3810
            if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3811
            return a;
3812
        }
3813
        if ( bExp == 0 ) --expDiff;
3814
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3815
        zExp = aExp;
3816
    }
3817
    else if ( expDiff < 0 ) {
3818
        if ( bExp == 0x7FFF ) {
3819
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3820
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3821
        }
3822
        if ( aExp == 0 ) ++expDiff;
3823
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3824
        zExp = bExp;
3825
    }
3826
    else {
3827
        if ( aExp == 0x7FFF ) {
3828
            if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3829
                return propagateFloatx80NaN( a, b STATUS_VAR );
3830
            }
3831
            return a;
3832
        }
3833
        zSig1 = 0;
3834
        zSig0 = aSig + bSig;
3835
        if ( aExp == 0 ) {
3836
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3837
            goto roundAndPack;
3838
        }
3839
        zExp = aExp;
3840
        goto shiftRight1;
3841
    }
3842
    zSig0 = aSig + bSig;
3843
    if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3844
 shiftRight1:
3845
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3846
    zSig0 |= LIT64( 0x8000000000000000 );
3847
    ++zExp;
3848
 roundAndPack:
3849
    return
3850
        roundAndPackFloatx80(
3851
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3852

    
3853
}
3854

    
3855
/*----------------------------------------------------------------------------
3856
| Returns the result of subtracting the absolute values of the extended
3857
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3858
| difference is negated before being returned.  `zSign' is ignored if the
3859
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
3860
| Standard for Binary Floating-Point Arithmetic.
3861
*----------------------------------------------------------------------------*/
3862

    
3863
static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
3864
{
3865
    int32 aExp, bExp, zExp;
3866
    bits64 aSig, bSig, zSig0, zSig1;
3867
    int32 expDiff;
3868
    floatx80 z;
3869

    
3870
    aSig = extractFloatx80Frac( a );
3871
    aExp = extractFloatx80Exp( a );
3872
    bSig = extractFloatx80Frac( b );
3873
    bExp = extractFloatx80Exp( b );
3874
    expDiff = aExp - bExp;
3875
    if ( 0 < expDiff ) goto aExpBigger;
3876
    if ( expDiff < 0 ) goto bExpBigger;
3877
    if ( aExp == 0x7FFF ) {
3878
        if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3879
            return propagateFloatx80NaN( a, b STATUS_VAR );
3880
        }
3881
        float_raise( float_flag_invalid STATUS_VAR);
3882
        z.low = floatx80_default_nan_low;
3883
        z.high = floatx80_default_nan_high;
3884
        return z;
3885
    }
3886
    if ( aExp == 0 ) {
3887
        aExp = 1;
3888
        bExp = 1;
3889
    }
3890
    zSig1 = 0;
3891
    if ( bSig < aSig ) goto aBigger;
3892
    if ( aSig < bSig ) goto bBigger;
3893
    return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3894
 bExpBigger:
3895
    if ( bExp == 0x7FFF ) {
3896
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3897
        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3898
    }
3899
    if ( aExp == 0 ) ++expDiff;
3900
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3901
 bBigger:
3902
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3903
    zExp = bExp;
3904
    zSign ^= 1;
3905
    goto normalizeRoundAndPack;
3906
 aExpBigger:
3907
    if ( aExp == 0x7FFF ) {
3908
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3909
        return a;
3910
    }
3911
    if ( bExp == 0 ) --expDiff;
3912
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3913
 aBigger:
3914
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3915
    zExp = aExp;
3916
 normalizeRoundAndPack:
3917
    return
3918
        normalizeRoundAndPackFloatx80(
3919
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3920

    
3921
}
3922

    
3923
/*----------------------------------------------------------------------------
3924
| Returns the result of adding the extended double-precision floating-point
3925
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
3926
| Standard for Binary Floating-Point Arithmetic.
3927
*----------------------------------------------------------------------------*/
3928

    
3929
floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
3930
{
3931
    flag aSign, bSign;
3932

    
3933
    aSign = extractFloatx80Sign( a );
3934
    bSign = extractFloatx80Sign( b );
3935
    if ( aSign == bSign ) {
3936
        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3937
    }
3938
    else {
3939
        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3940
    }
3941

    
3942
}
3943

    
3944
/*----------------------------------------------------------------------------
3945
| Returns the result of subtracting the extended double-precision floating-
3946
| point values `a' and `b'.  The operation is performed according to the
3947
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3948
*----------------------------------------------------------------------------*/
3949

    
3950
floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
3951
{
3952
    flag aSign, bSign;
3953

    
3954
    aSign = extractFloatx80Sign( a );
3955
    bSign = extractFloatx80Sign( b );
3956
    if ( aSign == bSign ) {
3957
        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3958
    }
3959
    else {
3960
        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3961
    }
3962

    
3963
}
3964

    
3965
/*----------------------------------------------------------------------------
3966
| Returns the result of multiplying the extended double-precision floating-
3967
| point values `a' and `b'.  The operation is performed according to the
3968
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3969
*----------------------------------------------------------------------------*/
3970

    
3971
floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
3972
{
3973
    flag aSign, bSign, zSign;
3974
    int32 aExp, bExp, zExp;
3975
    bits64 aSig, bSig, zSig0, zSig1;
3976
    floatx80 z;
3977

    
3978
    aSig = extractFloatx80Frac( a );
3979
    aExp = extractFloatx80Exp( a );
3980
    aSign = extractFloatx80Sign( a );
3981
    bSig = extractFloatx80Frac( b );
3982
    bExp = extractFloatx80Exp( b );
3983
    bSign = extractFloatx80Sign( b );
3984
    zSign = aSign ^ bSign;
3985
    if ( aExp == 0x7FFF ) {
3986
        if (    (bits64) ( aSig<<1 )
3987
             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3988
            return propagateFloatx80NaN( a, b STATUS_VAR );
3989
        }
3990
        if ( ( bExp | bSig ) == 0 ) goto invalid;
3991
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3992
    }
3993
    if ( bExp == 0x7FFF ) {
3994
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3995
        if ( ( aExp | aSig ) == 0 ) {
3996
 invalid:
3997
            float_raise( float_flag_invalid STATUS_VAR);
3998
            z.low = floatx80_default_nan_low;
3999
            z.high = floatx80_default_nan_high;
4000
            return z;
4001
        }
4002
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4003
    }
4004
    if ( aExp == 0 ) {
4005
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4006
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4007
    }
4008
    if ( bExp == 0 ) {
4009
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4010
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4011
    }
4012
    zExp = aExp + bExp - 0x3FFE;
4013
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
4014
    if ( 0 < (sbits64) zSig0 ) {
4015
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4016
        --zExp;
4017
    }
4018
    return
4019
        roundAndPackFloatx80(
4020
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4021

    
4022
}
4023

    
4024
/*----------------------------------------------------------------------------
4025
| Returns the result of dividing the extended double-precision floating-point
4026
| value `a' by the corresponding value `b'.  The operation is performed
4027
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4028
*----------------------------------------------------------------------------*/
4029

    
4030
floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4031
{
4032
    flag aSign, bSign, zSign;
4033
    int32 aExp, bExp, zExp;
4034
    bits64 aSig, bSig, zSig0, zSig1;
4035
    bits64 rem0, rem1, rem2, term0, term1, term2;
4036
    floatx80 z;
4037

    
4038
    aSig = extractFloatx80Frac( a );
4039
    aExp = extractFloatx80Exp( a );
4040
    aSign = extractFloatx80Sign( a );
4041
    bSig = extractFloatx80Frac( b );
4042
    bExp = extractFloatx80Exp( b );
4043
    bSign = extractFloatx80Sign( b );
4044
    zSign = aSign ^ bSign;
4045
    if ( aExp == 0x7FFF ) {
4046
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4047
        if ( bExp == 0x7FFF ) {
4048
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4049
            goto invalid;
4050
        }
4051
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4052
    }
4053
    if ( bExp == 0x7FFF ) {
4054
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4055
        return packFloatx80( zSign, 0, 0 );
4056
    }
4057
    if ( bExp == 0 ) {
4058
        if ( bSig == 0 ) {
4059
            if ( ( aExp | aSig ) == 0 ) {
4060
 invalid:
4061
                float_raise( float_flag_invalid STATUS_VAR);
4062
                z.low = floatx80_default_nan_low;
4063
                z.high = floatx80_default_nan_high;
4064
                return z;
4065
            }
4066
            float_raise( float_flag_divbyzero STATUS_VAR);
4067
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4068
        }
4069
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4070
    }
4071
    if ( aExp == 0 ) {
4072
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4073
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4074
    }
4075
    zExp = aExp - bExp + 0x3FFE;
4076
    rem1 = 0;
4077
    if ( bSig <= aSig ) {
4078
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
4079
        ++zExp;
4080
    }
4081
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4082
    mul64To128( bSig, zSig0, &term0, &term1 );
4083
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4084
    while ( (sbits64) rem0 < 0 ) {
4085
        --zSig0;
4086
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4087
    }
4088
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
4089
    if ( (bits64) ( zSig1<<1 ) <= 8 ) {
4090
        mul64To128( bSig, zSig1, &term1, &term2 );
4091
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4092
        while ( (sbits64) rem1 < 0 ) {
4093
            --zSig1;
4094
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4095
        }
4096
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
4097
    }
4098
    return
4099
        roundAndPackFloatx80(
4100
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4101

    
4102
}
4103

    
4104
/*----------------------------------------------------------------------------
4105
| Returns the remainder of the extended double-precision floating-point value
4106
| `a' with respect to the corresponding value `b'.  The operation is performed
4107
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4108
*----------------------------------------------------------------------------*/
4109

    
4110
floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4111
{
4112
    flag aSign, zSign;
4113
    int32 aExp, bExp, expDiff;
4114
    bits64 aSig0, aSig1, bSig;
4115
    bits64 q, term0, term1, alternateASig0, alternateASig1;
4116
    floatx80 z;
4117

    
4118
    aSig0 = extractFloatx80Frac( a );
4119
    aExp = extractFloatx80Exp( a );
4120
    aSign = extractFloatx80Sign( a );
4121
    bSig = extractFloatx80Frac( b );
4122
    bExp = extractFloatx80Exp( b );
4123
    if ( aExp == 0x7FFF ) {
4124
        if (    (bits64) ( aSig0<<1 )
4125
             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4126
            return propagateFloatx80NaN( a, b STATUS_VAR );
4127
        }
4128
        goto invalid;
4129
    }
4130
    if ( bExp == 0x7FFF ) {
4131
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4132
        return a;
4133
    }
4134
    if ( bExp == 0 ) {
4135
        if ( bSig == 0 ) {
4136
 invalid:
4137
            float_raise( float_flag_invalid STATUS_VAR);
4138
            z.low = floatx80_default_nan_low;
4139
            z.high = floatx80_default_nan_high;
4140
            return z;
4141
        }
4142
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4143
    }
4144
    if ( aExp == 0 ) {
4145
        if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
4146
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4147
    }
4148
    bSig |= LIT64( 0x8000000000000000 );
4149
    zSign = aSign;
4150
    expDiff = aExp - bExp;
4151
    aSig1 = 0;
4152
    if ( expDiff < 0 ) {
4153
        if ( expDiff < -1 ) return a;
4154
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4155
        expDiff = 0;
4156
    }
4157
    q = ( bSig <= aSig0 );
4158
    if ( q ) aSig0 -= bSig;
4159
    expDiff -= 64;
4160
    while ( 0 < expDiff ) {
4161
        q = estimateDiv128To64( aSig0, aSig1, bSig );
4162
        q = ( 2 < q ) ? q - 2 : 0;
4163
        mul64To128( bSig, q, &term0, &term1 );
4164
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4165
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4166
        expDiff -= 62;
4167
    }
4168
    expDiff += 64;
4169
    if ( 0 < expDiff ) {
4170
        q = estimateDiv128To64( aSig0, aSig1, bSig );
4171
        q = ( 2 < q ) ? q - 2 : 0;
4172
        q >>= 64 - expDiff;
4173
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4174
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4175
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4176
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
4177
            ++q;
4178
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4179
        }
4180
    }
4181
    else {
4182
        term1 = 0;
4183
        term0 = bSig;
4184
    }
4185
    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4186
    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4187
         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4188
              && ( q & 1 ) )
4189
       ) {
4190
        aSig0 = alternateASig0;
4191
        aSig1 = alternateASig1;
4192
        zSign = ! zSign;
4193
    }
4194
    return
4195
        normalizeRoundAndPackFloatx80(
4196
            80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4197

    
4198
}
4199

    
4200
/*----------------------------------------------------------------------------
4201
| Returns the square root of the extended double-precision floating-point
4202
| value `a'.  The operation is performed according to the IEC/IEEE Standard
4203
| for Binary Floating-Point Arithmetic.
4204
*----------------------------------------------------------------------------*/
4205

    
4206
floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4207
{
4208
    flag aSign;
4209
    int32 aExp, zExp;
4210
    bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4211
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4212
    floatx80 z;
4213

    
4214
    aSig0 = extractFloatx80Frac( a );
4215
    aExp = extractFloatx80Exp( a );
4216
    aSign = extractFloatx80Sign( a );
4217
    if ( aExp == 0x7FFF ) {
4218
        if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4219
        if ( ! aSign ) return a;
4220
        goto invalid;
4221
    }
4222
    if ( aSign ) {
4223
        if ( ( aExp | aSig0 ) == 0 ) return a;
4224
 invalid:
4225
        float_raise( float_flag_invalid STATUS_VAR);
4226
        z.low = floatx80_default_nan_low;
4227
        z.high = floatx80_default_nan_high;
4228
        return z;
4229
    }
4230
    if ( aExp == 0 ) {
4231
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4232
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4233
    }
4234
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4235
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4236
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4237
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4238
    doubleZSig0 = zSig0<<1;
4239
    mul64To128( zSig0, zSig0, &term0, &term1 );
4240
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4241
    while ( (sbits64) rem0 < 0 ) {
4242
        --zSig0;
4243
        doubleZSig0 -= 2;
4244
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4245
    }
4246
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4247
    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4248
        if ( zSig1 == 0 ) zSig1 = 1;
4249
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4250
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4251
        mul64To128( zSig1, zSig1, &term2, &term3 );
4252
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4253
        while ( (sbits64) rem1 < 0 ) {
4254
            --zSig1;
4255
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4256
            term3 |= 1;
4257
            term2 |= doubleZSig0;
4258
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4259
        }
4260
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4261
    }
4262
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4263
    zSig0 |= doubleZSig0;
4264
    return
4265
        roundAndPackFloatx80(
4266
            STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4267

    
4268
}
4269

    
4270
/*----------------------------------------------------------------------------
4271
| Returns 1 if the extended double-precision floating-point value `a' is
4272
| equal to the corresponding value `b', and 0 otherwise.  The comparison is
4273
| performed according to the IEC/IEEE Standard for Binary Floating-Point
4274
| Arithmetic.
4275
*----------------------------------------------------------------------------*/
4276

    
4277
int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4278
{
4279

    
4280
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4281
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4282
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4283
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4284
       ) {
4285
        if (    floatx80_is_signaling_nan( a )
4286
             || floatx80_is_signaling_nan( b ) ) {
4287
            float_raise( float_flag_invalid STATUS_VAR);
4288
        }
4289
        return 0;
4290
    }
4291
    return
4292
           ( a.low == b.low )
4293
        && (    ( a.high == b.high )
4294
             || (    ( a.low == 0 )
4295
                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4296
           );
4297

    
4298
}
4299

    
4300
/*----------------------------------------------------------------------------
4301
| Returns 1 if the extended double-precision floating-point value `a' is
4302
| less than or equal to the corresponding value `b', and 0 otherwise.  The
4303
| comparison is performed according to the IEC/IEEE Standard for Binary
4304
| Floating-Point Arithmetic.
4305
*----------------------------------------------------------------------------*/
4306

    
4307
int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4308
{
4309
    flag aSign, bSign;
4310

    
4311
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4312
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4313
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4314
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4315
       ) {
4316
        float_raise( float_flag_invalid STATUS_VAR);
4317
        return 0;
4318
    }
4319
    aSign = extractFloatx80Sign( a );
4320
    bSign = extractFloatx80Sign( b );
4321
    if ( aSign != bSign ) {
4322
        return
4323
               aSign
4324
            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4325
                 == 0 );
4326
    }
4327
    return
4328
          aSign ? le128( b.high, b.low, a.high, a.low )
4329
        : le128( a.high, a.low, b.high, b.low );
4330

    
4331
}
4332

    
4333
/*----------------------------------------------------------------------------
4334
| Returns 1 if the extended double-precision floating-point value `a' is
4335
| less than the corresponding value `b', and 0 otherwise.  The comparison
4336
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4337
| Arithmetic.
4338
*----------------------------------------------------------------------------*/
4339

    
4340
int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4341
{
4342
    flag aSign, bSign;
4343

    
4344
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4345
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4346
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4347
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4348
       ) {
4349
        float_raise( float_flag_invalid STATUS_VAR);
4350
        return 0;
4351
    }
4352
    aSign = extractFloatx80Sign( a );
4353
    bSign = extractFloatx80Sign( b );
4354
    if ( aSign != bSign ) {
4355
        return
4356
               aSign
4357
            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4358
                 != 0 );
4359
    }
4360
    return
4361
          aSign ? lt128( b.high, b.low, a.high, a.low )
4362
        : lt128( a.high, a.low, b.high, b.low );
4363

    
4364
}
4365

    
4366
/*----------------------------------------------------------------------------
4367
| Returns 1 if the extended double-precision floating-point value `a' is equal
4368
| to the corresponding value `b', and 0 otherwise.  The invalid exception is
4369
| raised if either operand is a NaN.  Otherwise, the comparison is performed
4370
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4371
*----------------------------------------------------------------------------*/
4372

    
4373
int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4374
{
4375

    
4376
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4377
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4378
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4379
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4380
       ) {
4381
        float_raise( float_flag_invalid STATUS_VAR);
4382
        return 0;
4383
    }
4384
    return
4385
           ( a.low == b.low )
4386
        && (    ( a.high == b.high )
4387
             || (    ( a.low == 0 )
4388
                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4389
           );
4390

    
4391
}
4392

    
4393
/*----------------------------------------------------------------------------
4394
| Returns 1 if the extended double-precision floating-point value `a' is less
4395
| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4396
| do not cause an exception.  Otherwise, the comparison is performed according
4397
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4398
*----------------------------------------------------------------------------*/
4399

    
4400
int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4401
{
4402
    flag aSign, bSign;
4403

    
4404
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4405
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4406
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4407
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4408
       ) {
4409
        if (    floatx80_is_signaling_nan( a )
4410
             || floatx80_is_signaling_nan( b ) ) {
4411
            float_raise( float_flag_invalid STATUS_VAR);
4412
        }
4413
        return 0;
4414
    }
4415
    aSign = extractFloatx80Sign( a );
4416
    bSign = extractFloatx80Sign( b );
4417
    if ( aSign != bSign ) {
4418
        return
4419
               aSign
4420
            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4421
                 == 0 );
4422
    }
4423
    return
4424
          aSign ? le128( b.high, b.low, a.high, a.low )
4425
        : le128( a.high, a.low, b.high, b.low );
4426

    
4427
}
4428

    
4429
/*----------------------------------------------------------------------------
4430
| Returns 1 if the extended double-precision floating-point value `a' is less
4431
| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4432
| an exception.  Otherwise, the comparison is performed according to the
4433
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4434
*----------------------------------------------------------------------------*/
4435

    
4436
int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4437
{
4438
    flag aSign, bSign;
4439

    
4440
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4441
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4442
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4443
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4444
       ) {
4445
        if (    floatx80_is_signaling_nan( a )
4446
             || floatx80_is_signaling_nan( b ) ) {
4447
            float_raise( float_flag_invalid STATUS_VAR);
4448
        }
4449
        return 0;
4450
    }
4451
    aSign = extractFloatx80Sign( a );
4452
    bSign = extractFloatx80Sign( b );
4453
    if ( aSign != bSign ) {
4454
        return
4455
               aSign
4456
            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4457
                 != 0 );
4458
    }
4459
    return
4460
          aSign ? lt128( b.high, b.low, a.high, a.low )
4461
        : lt128( a.high, a.low, b.high, b.low );
4462

    
4463
}
4464

    
4465
#endif
4466

    
4467
#ifdef FLOAT128
4468

    
4469
/*----------------------------------------------------------------------------
4470
| Returns the result of converting the quadruple-precision floating-point
4471
| value `a' to the 32-bit two's complement integer format.  The conversion
4472
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4473
| Arithmetic---which means in particular that the conversion is rounded
4474
| according to the current rounding mode.  If `a' is a NaN, the largest
4475
| positive integer is returned.  Otherwise, if the conversion overflows, the
4476
| largest integer with the same sign as `a' is returned.
4477
*----------------------------------------------------------------------------*/
4478

    
4479
int32 float128_to_int32( float128 a STATUS_PARAM )
4480
{
4481
    flag aSign;
4482
    int32 aExp, shiftCount;
4483
    bits64 aSig0, aSig1;
4484

    
4485
    aSig1 = extractFloat128Frac1( a );
4486
    aSig0 = extractFloat128Frac0( a );
4487
    aExp = extractFloat128Exp( a );
4488
    aSign = extractFloat128Sign( a );
4489
    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4490
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4491
    aSig0 |= ( aSig1 != 0 );
4492
    shiftCount = 0x4028 - aExp;
4493
    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4494
    return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4495

    
4496
}
4497

    
4498
/*----------------------------------------------------------------------------
4499
| Returns the result of converting the quadruple-precision floating-point
4500
| value `a' to the 32-bit two's complement integer format.  The conversion
4501
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4502
| Arithmetic, except that the conversion is always rounded toward zero.  If
4503
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4504
| conversion overflows, the largest integer with the same sign as `a' is
4505
| returned.
4506
*----------------------------------------------------------------------------*/
4507

    
4508
int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4509
{
4510
    flag aSign;
4511
    int32 aExp, shiftCount;
4512
    bits64 aSig0, aSig1, savedASig;
4513
    int32 z;
4514

    
4515
    aSig1 = extractFloat128Frac1( a );
4516
    aSig0 = extractFloat128Frac0( a );
4517
    aExp = extractFloat128Exp( a );
4518
    aSign = extractFloat128Sign( a );
4519
    aSig0 |= ( aSig1 != 0 );
4520
    if ( 0x401E < aExp ) {
4521
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4522
        goto invalid;
4523
    }
4524
    else if ( aExp < 0x3FFF ) {
4525
        if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4526
        return 0;
4527
    }
4528
    aSig0 |= LIT64( 0x0001000000000000 );
4529
    shiftCount = 0x402F - aExp;
4530
    savedASig = aSig0;
4531
    aSig0 >>= shiftCount;
4532
    z = aSig0;
4533
    if ( aSign ) z = - z;
4534
    if ( ( z < 0 ) ^ aSign ) {
4535
 invalid:
4536
        float_raise( float_flag_invalid STATUS_VAR);
4537
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4538
    }
4539
    if ( ( aSig0<<shiftCount ) != savedASig ) {
4540
        STATUS(float_exception_flags) |= float_flag_inexact;
4541
    }
4542
    return z;
4543

    
4544
}
4545

    
4546
/*----------------------------------------------------------------------------
4547
| Returns the result of converting the quadruple-precision floating-point
4548
| value `a' to the 64-bit two's complement integer format.  The conversion
4549
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4550
| Arithmetic---which means in particular that the conversion is rounded
4551
| according to the current rounding mode.  If `a' is a NaN, the largest
4552
| positive integer is returned.  Otherwise, if the conversion overflows, the
4553
| largest integer with the same sign as `a' is returned.
4554
*----------------------------------------------------------------------------*/
4555

    
4556
int64 float128_to_int64( float128 a STATUS_PARAM )
4557
{
4558
    flag aSign;
4559
    int32 aExp, shiftCount;
4560
    bits64 aSig0, aSig1;
4561

    
4562
    aSig1 = extractFloat128Frac1( a );
4563
    aSig0 = extractFloat128Frac0( a );
4564
    aExp = extractFloat128Exp( a );
4565
    aSign = extractFloat128Sign( a );
4566
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4567
    shiftCount = 0x402F - aExp;
4568
    if ( shiftCount <= 0 ) {
4569
        if ( 0x403E < aExp ) {
4570
            float_raise( float_flag_invalid STATUS_VAR);
4571
            if (    ! aSign
4572
                 || (    ( aExp == 0x7FFF )
4573
                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4574
                    )
4575
               ) {
4576
                return LIT64( 0x7FFFFFFFFFFFFFFF );
4577
            }
4578
            return (sbits64) LIT64( 0x8000000000000000 );
4579
        }
4580
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4581
    }
4582
    else {
4583
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4584
    }
4585
    return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4586

    
4587
}
4588

    
4589
/*----------------------------------------------------------------------------
4590
| Returns the result of converting the quadruple-precision floating-point
4591
| value `a' to the 64-bit two's complement integer format.  The conversion
4592
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4593
| Arithmetic, except that the conversion is always rounded toward zero.
4594
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4595
| the conversion overflows, the largest integer with the same sign as `a' is
4596
| returned.
4597
*----------------------------------------------------------------------------*/
4598

    
4599
int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4600
{
4601
    flag aSign;
4602
    int32 aExp, shiftCount;
4603
    bits64 aSig0, aSig1;
4604
    int64 z;
4605

    
4606
    aSig1 = extractFloat128Frac1( a );
4607
    aSig0 = extractFloat128Frac0( a );
4608
    aExp = extractFloat128Exp( a );
4609
    aSign = extractFloat128Sign( a );
4610
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4611
    shiftCount = aExp - 0x402F;
4612
    if ( 0 < shiftCount ) {
4613
        if ( 0x403E <= aExp ) {
4614
            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4615
            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4616
                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4617
                if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4618
            }
4619
            else {
4620
                float_raise( float_flag_invalid STATUS_VAR);
4621
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4622
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
4623
                }
4624
            }
4625
            return (sbits64) LIT64( 0x8000000000000000 );
4626
        }
4627
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4628
        if ( (bits64) ( aSig1<<shiftCount ) ) {
4629
            STATUS(float_exception_flags) |= float_flag_inexact;
4630
        }
4631
    }
4632
    else {
4633
        if ( aExp < 0x3FFF ) {
4634
            if ( aExp | aSig0 | aSig1 ) {
4635
                STATUS(float_exception_flags) |= float_flag_inexact;
4636
            }
4637
            return 0;
4638
        }
4639
        z = aSig0>>( - shiftCount );
4640
        if (    aSig1
4641
             || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4642
            STATUS(float_exception_flags) |= float_flag_inexact;
4643
        }
4644
    }
4645
    if ( aSign ) z = - z;
4646
    return z;
4647

    
4648
}
4649

    
4650
/*----------------------------------------------------------------------------
4651
| Returns the result of converting the quadruple-precision floating-point
4652
| value `a' to the single-precision floating-point format.  The conversion
4653
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4654
| Arithmetic.
4655
*----------------------------------------------------------------------------*/
4656

    
4657
float32 float128_to_float32( float128 a STATUS_PARAM )
4658
{
4659
    flag aSign;
4660
    int32 aExp;
4661
    bits64 aSig0, aSig1;
4662
    bits32 zSig;
4663

    
4664
    aSig1 = extractFloat128Frac1( a );
4665
    aSig0 = extractFloat128Frac0( a );
4666
    aExp = extractFloat128Exp( a );
4667
    aSign = extractFloat128Sign( a );
4668
    if ( aExp == 0x7FFF ) {
4669
        if ( aSig0 | aSig1 ) {
4670
            return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4671
        }
4672
        return packFloat32( aSign, 0xFF, 0 );
4673
    }
4674
    aSig0 |= ( aSig1 != 0 );
4675
    shift64RightJamming( aSig0, 18, &aSig0 );
4676
    zSig = aSig0;
4677
    if ( aExp || zSig ) {
4678
        zSig |= 0x40000000;
4679
        aExp -= 0x3F81;
4680
    }
4681
    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4682

    
4683
}
4684

    
4685
/*----------------------------------------------------------------------------
4686
| Returns the result of converting the quadruple-precision floating-point
4687
| value `a' to the double-precision floating-point format.  The conversion
4688
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4689
| Arithmetic.
4690
*----------------------------------------------------------------------------*/
4691

    
4692
float64 float128_to_float64( float128 a STATUS_PARAM )
4693
{
4694
    flag aSign;
4695
    int32 aExp;
4696
    bits64 aSig0, aSig1;
4697

    
4698
    aSig1 = extractFloat128Frac1( a );
4699
    aSig0 = extractFloat128Frac0( a );
4700
    aExp = extractFloat128Exp( a );
4701
    aSign = extractFloat128Sign( a );
4702
    if ( aExp == 0x7FFF ) {
4703
        if ( aSig0 | aSig1 ) {
4704
            return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4705
        }
4706
        return packFloat64( aSign, 0x7FF, 0 );
4707
    }
4708
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4709
    aSig0 |= ( aSig1 != 0 );
4710
    if ( aExp || aSig0 ) {
4711
        aSig0 |= LIT64( 0x4000000000000000 );
4712
        aExp -= 0x3C01;
4713
    }
4714
    return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4715

    
4716
}
4717

    
4718
#ifdef FLOATX80
4719

    
4720
/*----------------------------------------------------------------------------
4721
| Returns the result of converting the quadruple-precision floating-point
4722
| value `a' to the extended double-precision floating-point format.  The
4723
| conversion is performed according to the IEC/IEEE Standard for Binary
4724
| Floating-Point Arithmetic.
4725
*----------------------------------------------------------------------------*/
4726

    
4727
floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4728
{
4729
    flag aSign;
4730
    int32 aExp;
4731
    bits64 aSig0, aSig1;
4732

    
4733
    aSig1 = extractFloat128Frac1( a );
4734
    aSig0 = extractFloat128Frac0( a );
4735
    aExp = extractFloat128Exp( a );
4736
    aSign = extractFloat128Sign( a );
4737
    if ( aExp == 0x7FFF ) {
4738
        if ( aSig0 | aSig1 ) {
4739
            return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4740
        }
4741
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4742
    }
4743
    if ( aExp == 0 ) {
4744
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4745
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4746
    }
4747
    else {
4748
        aSig0 |= LIT64( 0x0001000000000000 );
4749
    }
4750
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4751
    return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4752

    
4753
}
4754

    
4755
#endif
4756

    
4757
/*----------------------------------------------------------------------------
4758
| Rounds the quadruple-precision floating-point value `a' to an integer, and
4759
| returns the result as a quadruple-precision floating-point value.  The
4760
| operation is performed according to the IEC/IEEE Standard for Binary
4761
| Floating-Point Arithmetic.
4762
*----------------------------------------------------------------------------*/
4763

    
4764
float128 float128_round_to_int( float128 a STATUS_PARAM )
4765
{
4766
    flag aSign;
4767
    int32 aExp;
4768
    bits64 lastBitMask, roundBitsMask;
4769
    int8 roundingMode;
4770
    float128 z;
4771

    
4772
    aExp = extractFloat128Exp( a );
4773
    if ( 0x402F <= aExp ) {
4774
        if ( 0x406F <= aExp ) {
4775
            if (    ( aExp == 0x7FFF )
4776
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4777
               ) {
4778
                return propagateFloat128NaN( a, a STATUS_VAR );
4779
            }
4780
            return a;
4781
        }
4782
        lastBitMask = 1;
4783
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4784
        roundBitsMask = lastBitMask - 1;
4785
        z = a;
4786
        roundingMode = STATUS(float_rounding_mode);
4787
        if ( roundingMode == float_round_nearest_even ) {
4788
            if ( lastBitMask ) {
4789
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4790
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4791
            }
4792
            else {
4793
                if ( (sbits64) z.low < 0 ) {
4794
                    ++z.high;
4795
                    if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4796
                }
4797
            }
4798
        }
4799
        else if ( roundingMode != float_round_to_zero ) {
4800
            if (   extractFloat128Sign( z )
4801
                 ^ ( roundingMode == float_round_up ) ) {
4802
                add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4803
            }
4804
        }
4805
        z.low &= ~ roundBitsMask;
4806
    }
4807
    else {
4808
        if ( aExp < 0x3FFF ) {
4809
            if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4810
            STATUS(float_exception_flags) |= float_flag_inexact;
4811
            aSign = extractFloat128Sign( a );
4812
            switch ( STATUS(float_rounding_mode) ) {
4813
             case float_round_nearest_even:
4814
                if (    ( aExp == 0x3FFE )
4815
                     && (   extractFloat128Frac0( a )
4816
                          | extractFloat128Frac1( a ) )
4817
                   ) {
4818
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
4819
                }
4820
                break;
4821
             case float_round_down:
4822
                return
4823
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4824
                    : packFloat128( 0, 0, 0, 0 );
4825
             case float_round_up:
4826
                return
4827
                      aSign ? packFloat128( 1, 0, 0, 0 )
4828
                    : packFloat128( 0, 0x3FFF, 0, 0 );
4829
            }
4830
            return packFloat128( aSign, 0, 0, 0 );
4831
        }
4832
        lastBitMask = 1;
4833
        lastBitMask <<= 0x402F - aExp;
4834
        roundBitsMask = lastBitMask - 1;
4835
        z.low = 0;
4836
        z.high = a.high;
4837
        roundingMode = STATUS(float_rounding_mode);
4838
        if ( roundingMode == float_round_nearest_even ) {
4839
            z.high += lastBitMask>>1;
4840
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4841
                z.high &= ~ lastBitMask;
4842
            }
4843
        }
4844
        else if ( roundingMode != float_round_to_zero ) {
4845
            if (   extractFloat128Sign( z )
4846
                 ^ ( roundingMode == float_round_up ) ) {
4847
                z.high |= ( a.low != 0 );
4848
                z.high += roundBitsMask;
4849
            }
4850
        }
4851
        z.high &= ~ roundBitsMask;
4852
    }
4853
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4854
        STATUS(float_exception_flags) |= float_flag_inexact;
4855
    }
4856
    return z;
4857

    
4858
}
4859

    
4860
/*----------------------------------------------------------------------------
4861
| Returns the result of adding the absolute values of the quadruple-precision
4862
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4863
| before being returned.  `zSign' is ignored if the result is a NaN.
4864
| The addition is performed according to the IEC/IEEE Standard for Binary
4865
| Floating-Point Arithmetic.
4866
*----------------------------------------------------------------------------*/
4867

    
4868
static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4869
{
4870
    int32 aExp, bExp, zExp;
4871
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4872
    int32 expDiff;
4873

    
4874
    aSig1 = extractFloat128Frac1( a );
4875
    aSig0 = extractFloat128Frac0( a );
4876
    aExp = extractFloat128Exp( a );
4877
    bSig1 = extractFloat128Frac1( b );
4878
    bSig0 = extractFloat128Frac0( b );
4879
    bExp = extractFloat128Exp( b );
4880
    expDiff = aExp - bExp;
4881
    if ( 0 < expDiff ) {
4882
        if ( aExp == 0x7FFF ) {
4883
            if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4884
            return a;
4885
        }
4886
        if ( bExp == 0 ) {
4887
            --expDiff;
4888
        }
4889
        else {
4890
            bSig0 |= LIT64( 0x0001000000000000 );
4891
        }
4892
        shift128ExtraRightJamming(
4893
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4894
        zExp = aExp;
4895
    }
4896
    else if ( expDiff < 0 ) {
4897
        if ( bExp == 0x7FFF ) {
4898
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4899
            return packFloat128( zSign, 0x7FFF, 0, 0 );
4900
        }
4901
        if ( aExp == 0 ) {
4902
            ++expDiff;
4903
        }
4904
        else {
4905
            aSig0 |= LIT64( 0x0001000000000000 );
4906
        }
4907
        shift128ExtraRightJamming(
4908
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4909
        zExp = bExp;
4910
    }
4911
    else {
4912
        if ( aExp == 0x7FFF ) {
4913
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4914
                return propagateFloat128NaN( a, b STATUS_VAR );
4915
            }
4916
            return a;
4917
        }
4918
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4919
        if ( aExp == 0 ) {
4920
            if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
4921
            return packFloat128( zSign, 0, zSig0, zSig1 );
4922
        }
4923
        zSig2 = 0;
4924
        zSig0 |= LIT64( 0x0002000000000000 );
4925
        zExp = aExp;
4926
        goto shiftRight1;
4927
    }
4928
    aSig0 |= LIT64( 0x0001000000000000 );
4929
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4930
    --zExp;
4931
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4932
    ++zExp;
4933
 shiftRight1:
4934
    shift128ExtraRightJamming(
4935
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4936
 roundAndPack:
4937
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4938

    
4939
}
4940

    
4941
/*----------------------------------------------------------------------------
4942
| Returns the result of subtracting the absolute values of the quadruple-
4943
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
4944
| difference is negated before being returned.  `zSign' is ignored if the
4945
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
4946
| Standard for Binary Floating-Point Arithmetic.
4947
*----------------------------------------------------------------------------*/
4948

    
4949
static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4950
{
4951
    int32 aExp, bExp, zExp;
4952
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4953
    int32 expDiff;
4954
    float128 z;
4955

    
4956
    aSig1 = extractFloat128Frac1( a );
4957
    aSig0 = extractFloat128Frac0( a );
4958
    aExp = extractFloat128Exp( a );
4959
    bSig1 = extractFloat128Frac1( b );
4960
    bSig0 = extractFloat128Frac0( b );
4961
    bExp = extractFloat128Exp( b );
4962
    expDiff = aExp - bExp;
4963
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4964
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4965
    if ( 0 < expDiff ) goto aExpBigger;
4966
    if ( expDiff < 0 ) goto bExpBigger;
4967
    if ( aExp == 0x7FFF ) {
4968
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4969
            return propagateFloat128NaN( a, b STATUS_VAR );
4970
        }
4971
        float_raise( float_flag_invalid STATUS_VAR);
4972
        z.low = float128_default_nan_low;
4973
        z.high = float128_default_nan_high;
4974
        return z;
4975
    }
4976
    if ( aExp == 0 ) {
4977
        aExp = 1;
4978
        bExp = 1;
4979
    }
4980
    if ( bSig0 < aSig0 ) goto aBigger;
4981
    if ( aSig0 < bSig0 ) goto bBigger;
4982
    if ( bSig1 < aSig1 ) goto aBigger;
4983
    if ( aSig1 < bSig1 ) goto bBigger;
4984
    return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
4985
 bExpBigger:
4986
    if ( bExp == 0x7FFF ) {
4987
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4988
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4989
    }
4990
    if ( aExp == 0 ) {
4991
        ++expDiff;
4992
    }
4993
    else {
4994
        aSig0 |= LIT64( 0x4000000000000000 );
4995
    }
4996
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4997
    bSig0 |= LIT64( 0x4000000000000000 );
4998
 bBigger:
4999
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5000
    zExp = bExp;
5001
    zSign ^= 1;
5002
    goto normalizeRoundAndPack;
5003
 aExpBigger:
5004
    if ( aExp == 0x7FFF ) {
5005
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5006
        return a;
5007
    }
5008
    if ( bExp == 0 ) {
5009
        --expDiff;
5010
    }
5011
    else {
5012
        bSig0 |= LIT64( 0x4000000000000000 );
5013
    }
5014
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5015
    aSig0 |= LIT64( 0x4000000000000000 );
5016
 aBigger:
5017
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5018
    zExp = aExp;
5019
 normalizeRoundAndPack:
5020
    --zExp;
5021
    return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5022

    
5023
}
5024

    
5025
/*----------------------------------------------------------------------------
5026
| Returns the result of adding the quadruple-precision floating-point values
5027
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
5028
| for Binary Floating-Point Arithmetic.
5029
*----------------------------------------------------------------------------*/
5030

    
5031
float128 float128_add( float128 a, float128 b STATUS_PARAM )
5032
{
5033
    flag aSign, bSign;
5034

    
5035
    aSign = extractFloat128Sign( a );
5036
    bSign = extractFloat128Sign( b );
5037
    if ( aSign == bSign ) {
5038
        return addFloat128Sigs( a, b, aSign STATUS_VAR );
5039
    }
5040
    else {
5041
        return subFloat128Sigs( a, b, aSign STATUS_VAR );
5042
    }
5043

    
5044
}
5045

    
5046
/*----------------------------------------------------------------------------
5047
| Returns the result of subtracting the quadruple-precision floating-point
5048
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
5049
| Standard for Binary Floating-Point Arithmetic.
5050
*----------------------------------------------------------------------------*/
5051

    
5052
float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5053
{
5054
    flag aSign, bSign;
5055

    
5056
    aSign = extractFloat128Sign( a );
5057
    bSign = extractFloat128Sign( b );
5058
    if ( aSign == bSign ) {
5059
        return subFloat128Sigs( a, b, aSign STATUS_VAR );
5060
    }
5061
    else {
5062
        return addFloat128Sigs( a, b, aSign STATUS_VAR );
5063
    }
5064

    
5065
}
5066

    
5067
/*----------------------------------------------------------------------------
5068
| Returns the result of multiplying the quadruple-precision floating-point
5069
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
5070
| Standard for Binary Floating-Point Arithmetic.
5071
*----------------------------------------------------------------------------*/
5072

    
5073
float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5074
{
5075
    flag aSign, bSign, zSign;
5076
    int32 aExp, bExp, zExp;
5077
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5078
    float128 z;
5079

    
5080
    aSig1 = extractFloat128Frac1( a );
5081
    aSig0 = extractFloat128Frac0( a );
5082
    aExp = extractFloat128Exp( a );
5083
    aSign = extractFloat128Sign( a );
5084
    bSig1 = extractFloat128Frac1( b );
5085
    bSig0 = extractFloat128Frac0( b );
5086
    bExp = extractFloat128Exp( b );
5087
    bSign = extractFloat128Sign( b );
5088
    zSign = aSign ^ bSign;
5089
    if ( aExp == 0x7FFF ) {
5090
        if (    ( aSig0 | aSig1 )
5091
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5092
            return propagateFloat128NaN( a, b STATUS_VAR );
5093
        }
5094
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5095
        return packFloat128( zSign, 0x7FFF, 0, 0 );
5096
    }
5097
    if ( bExp == 0x7FFF ) {
5098
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5099
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5100
 invalid:
5101
            float_raise( float_flag_invalid STATUS_VAR);
5102
            z.low = float128_default_nan_low;
5103
            z.high = float128_default_nan_high;
5104
            return z;
5105
        }
5106
        return packFloat128( zSign, 0x7FFF, 0, 0 );
5107
    }
5108
    if ( aExp == 0 ) {
5109
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5110
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5111
    }
5112
    if ( bExp == 0 ) {
5113
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5114
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5115
    }
5116
    zExp = aExp + bExp - 0x4000;
5117
    aSig0 |= LIT64( 0x0001000000000000 );
5118
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5119
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5120
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5121
    zSig2 |= ( zSig3 != 0 );
5122
    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5123
        shift128ExtraRightJamming(
5124
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5125
        ++zExp;
5126
    }
5127
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5128

    
5129
}
5130

    
5131
/*----------------------------------------------------------------------------
5132
| Returns the result of dividing the quadruple-precision floating-point value
5133
| `a' by the corresponding value `b'.  The operation is performed according to
5134
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5135
*----------------------------------------------------------------------------*/
5136

    
5137
float128 float128_div( float128 a, float128 b STATUS_PARAM )
5138
{
5139
    flag aSign, bSign, zSign;
5140
    int32 aExp, bExp, zExp;
5141
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5142
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5143
    float128 z;
5144

    
5145
    aSig1 = extractFloat128Frac1( a );
5146
    aSig0 = extractFloat128Frac0( a );
5147
    aExp = extractFloat128Exp( a );
5148
    aSign = extractFloat128Sign( a );
5149
    bSig1 = extractFloat128Frac1( b );
5150
    bSig0 = extractFloat128Frac0( b );
5151
    bExp = extractFloat128Exp( b );
5152
    bSign = extractFloat128Sign( b );
5153
    zSign = aSign ^ bSign;
5154
    if ( aExp == 0x7FFF ) {
5155
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5156
        if ( bExp == 0x7FFF ) {
5157
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5158
            goto invalid;
5159
        }
5160
        return packFloat128( zSign, 0x7FFF, 0, 0 );
5161
    }
5162
    if ( bExp == 0x7FFF ) {
5163
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5164
        return packFloat128( zSign, 0, 0, 0 );
5165
    }
5166
    if ( bExp == 0 ) {
5167
        if ( ( bSig0 | bSig1 ) == 0 ) {
5168
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5169
 invalid:
5170
                float_raise( float_flag_invalid STATUS_VAR);
5171
                z.low = float128_default_nan_low;
5172
                z.high = float128_default_nan_high;
5173
                return z;
5174
            }
5175
            float_raise( float_flag_divbyzero STATUS_VAR);
5176
            return packFloat128( zSign, 0x7FFF, 0, 0 );
5177
        }
5178
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5179
    }
5180
    if ( aExp == 0 ) {
5181
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5182
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5183
    }
5184
    zExp = aExp - bExp + 0x3FFD;
5185
    shortShift128Left(
5186
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5187
    shortShift128Left(
5188
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5189
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5190
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5191
        ++zExp;
5192
    }
5193
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5194
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5195
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5196
    while ( (sbits64) rem0 < 0 ) {
5197
        --zSig0;
5198
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5199
    }
5200
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5201
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5202
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5203
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5204
        while ( (sbits64) rem1 < 0 ) {
5205
            --zSig1;
5206
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5207
        }
5208
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5209
    }
5210
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5211
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5212

    
5213
}
5214

    
5215
/*----------------------------------------------------------------------------
5216
| Returns the remainder of the quadruple-precision floating-point value `a'
5217
| with respect to the corresponding value `b'.  The operation is performed
5218
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5219
*----------------------------------------------------------------------------*/
5220

    
5221
float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5222
{
5223
    flag aSign, zSign;
5224
    int32 aExp, bExp, expDiff;
5225
    bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5226
    bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5227
    sbits64 sigMean0;
5228
    float128 z;
5229

    
5230
    aSig1 = extractFloat128Frac1( a );
5231
    aSig0 = extractFloat128Frac0( a );
5232
    aExp = extractFloat128Exp( a );
5233
    aSign = extractFloat128Sign( a );
5234
    bSig1 = extractFloat128Frac1( b );
5235
    bSig0 = extractFloat128Frac0( b );
5236
    bExp = extractFloat128Exp( b );
5237
    if ( aExp == 0x7FFF ) {
5238
        if (    ( aSig0 | aSig1 )
5239
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5240
            return propagateFloat128NaN( a, b STATUS_VAR );
5241
        }
5242
        goto invalid;
5243
    }
5244
    if ( bExp == 0x7FFF ) {
5245
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5246
        return a;
5247
    }
5248
    if ( bExp == 0 ) {
5249
        if ( ( bSig0 | bSig1 ) == 0 ) {
5250
 invalid:
5251
            float_raise( float_flag_invalid STATUS_VAR);
5252
            z.low = float128_default_nan_low;
5253
            z.high = float128_default_nan_high;
5254
            return z;
5255
        }
5256
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5257
    }
5258
    if ( aExp == 0 ) {
5259
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
5260
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5261
    }
5262
    expDiff = aExp - bExp;
5263
    if ( expDiff < -1 ) return a;
5264
    shortShift128Left(
5265
        aSig0 | LIT64( 0x0001000000000000 ),
5266
        aSig1,
5267
        15 - ( expDiff < 0 ),
5268
        &aSig0,
5269
        &aSig1
5270
    );
5271
    shortShift128Left(
5272
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5273
    q = le128( bSig0, bSig1, aSig0, aSig1 );
5274
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5275
    expDiff -= 64;
5276
    while ( 0 < expDiff ) {
5277
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5278
        q = ( 4 < q ) ? q - 4 : 0;
5279
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5280
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5281
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5282
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5283
        expDiff -= 61;
5284
    }
5285
    if ( -64 < expDiff ) {
5286
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5287
        q = ( 4 < q ) ? q - 4 : 0;
5288
        q >>= - expDiff;
5289
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5290
        expDiff += 52;
5291
        if ( expDiff < 0 ) {
5292
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5293
        }
5294
        else {
5295
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5296
        }
5297
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5298
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5299
    }
5300
    else {
5301
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5302
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5303
    }
5304
    do {
5305
        alternateASig0 = aSig0;
5306
        alternateASig1 = aSig1;
5307
        ++q;
5308
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5309
    } while ( 0 <= (sbits64) aSig0 );
5310
    add128(
5311
        aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5312
    if (    ( sigMean0 < 0 )
5313
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5314
        aSig0 = alternateASig0;
5315
        aSig1 = alternateASig1;
5316
    }
5317
    zSign = ( (sbits64) aSig0 < 0 );
5318
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5319
    return
5320
        normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5321

    
5322
}
5323

    
5324
/*----------------------------------------------------------------------------
5325
| Returns the square root of the quadruple-precision floating-point value `a'.
5326
| The operation is performed according to the IEC/IEEE Standard for Binary
5327
| Floating-Point Arithmetic.
5328
*----------------------------------------------------------------------------*/
5329

    
5330
float128 float128_sqrt( float128 a STATUS_PARAM )
5331
{
5332
    flag aSign;
5333
    int32 aExp, zExp;
5334
    bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5335
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5336
    float128 z;
5337

    
5338
    aSig1 = extractFloat128Frac1( a );
5339
    aSig0 = extractFloat128Frac0( a );
5340
    aExp = extractFloat128Exp( a );
5341
    aSign = extractFloat128Sign( a );
5342
    if ( aExp == 0x7FFF ) {
5343
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5344
        if ( ! aSign ) return a;
5345
        goto invalid;
5346
    }
5347
    if ( aSign ) {
5348
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5349
 invalid:
5350
        float_raise( float_flag_invalid STATUS_VAR);
5351
        z.low = float128_default_nan_low;
5352
        z.high = float128_default_nan_high;
5353
        return z;
5354
    }
5355
    if ( aExp == 0 ) {
5356
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5357
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5358
    }
5359
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5360
    aSig0 |= LIT64( 0x0001000000000000 );
5361
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5362
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5363
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5364
    doubleZSig0 = zSig0<<1;
5365
    mul64To128( zSig0, zSig0, &term0, &term1 );
5366
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5367
    while ( (sbits64) rem0 < 0 ) {
5368
        --zSig0;
5369
        doubleZSig0 -= 2;
5370
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5371
    }
5372
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5373
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5374
        if ( zSig1 == 0 ) zSig1 = 1;
5375
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5376
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5377
        mul64To128( zSig1, zSig1, &term2, &term3 );
5378
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5379
        while ( (sbits64) rem1 < 0 ) {
5380
            --zSig1;
5381
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5382
            term3 |= 1;
5383
            term2 |= doubleZSig0;
5384
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5385
        }
5386
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5387
    }
5388
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5389
    return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5390

    
5391
}
5392

    
5393
/*----------------------------------------------------------------------------
5394
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5395
| the corresponding value `b', and 0 otherwise.  The comparison is performed
5396
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5397
*----------------------------------------------------------------------------*/
5398

    
5399
int float128_eq( float128 a, float128 b STATUS_PARAM )
5400
{
5401

    
5402
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5403
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5404
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5405
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5406
       ) {
5407
        if (    float128_is_signaling_nan( a )
5408
             || float128_is_signaling_nan( b ) ) {
5409
            float_raise( float_flag_invalid STATUS_VAR);
5410
        }
5411
        return 0;
5412
    }
5413
    return
5414
           ( a.low == b.low )
5415
        && (    ( a.high == b.high )
5416
             || (    ( a.low == 0 )
5417
                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5418
           );
5419

    
5420
}
5421

    
5422
/*----------------------------------------------------------------------------
5423
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5424
| or equal to the corresponding value `b', and 0 otherwise.  The comparison
5425
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5426
| Arithmetic.
5427
*----------------------------------------------------------------------------*/
5428

    
5429
int float128_le( float128 a, float128 b STATUS_PARAM )
5430
{
5431
    flag aSign, bSign;
5432

    
5433
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5434
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5435
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5436
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5437
       ) {
5438
        float_raise( float_flag_invalid STATUS_VAR);
5439
        return 0;
5440
    }
5441
    aSign = extractFloat128Sign( a );
5442
    bSign = extractFloat128Sign( b );
5443
    if ( aSign != bSign ) {
5444
        return
5445
               aSign
5446
            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5447
                 == 0 );
5448
    }
5449
    return
5450
          aSign ? le128( b.high, b.low, a.high, a.low )
5451
        : le128( a.high, a.low, b.high, b.low );
5452

    
5453
}
5454

    
5455
/*----------------------------------------------------------------------------
5456
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5457
| the corresponding value `b', and 0 otherwise.  The comparison is performed
5458
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5459
*----------------------------------------------------------------------------*/
5460

    
5461
int float128_lt( float128 a, float128 b STATUS_PARAM )
5462
{
5463
    flag aSign, bSign;
5464

    
5465
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5466
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5467
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5468
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5469
       ) {
5470
        float_raise( float_flag_invalid STATUS_VAR);
5471
        return 0;
5472
    }
5473
    aSign = extractFloat128Sign( a );
5474
    bSign = extractFloat128Sign( b );
5475
    if ( aSign != bSign ) {
5476
        return
5477
               aSign
5478
            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5479
                 != 0 );
5480
    }
5481
    return
5482
          aSign ? lt128( b.high, b.low, a.high, a.low )
5483
        : lt128( a.high, a.low, b.high, b.low );
5484

    
5485
}
5486

    
5487
/*----------------------------------------------------------------------------
5488
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5489
| the corresponding value `b', and 0 otherwise.  The invalid exception is
5490
| raised if either operand is a NaN.  Otherwise, the comparison is performed
5491
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5492
*----------------------------------------------------------------------------*/
5493

    
5494
int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5495
{
5496

    
5497
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5498
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5499
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5500
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5501
       ) {
5502
        float_raise( float_flag_invalid STATUS_VAR);
5503
        return 0;
5504
    }
5505
    return
5506
           ( a.low == b.low )
5507
        && (    ( a.high == b.high )
5508
             || (    ( a.low == 0 )
5509
                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5510
           );
5511

    
5512
}
5513

    
5514
/*----------------------------------------------------------------------------
5515
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5516
| or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5517
| cause an exception.  Otherwise, the comparison is performed according to the
5518
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5519
*----------------------------------------------------------------------------*/
5520

    
5521
int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5522
{
5523
    flag aSign, bSign;
5524

    
5525
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5526
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5527
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5528
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5529
       ) {
5530
        if (    float128_is_signaling_nan( a )
5531
             || float128_is_signaling_nan( b ) ) {
5532
            float_raise( float_flag_invalid STATUS_VAR);
5533
        }
5534
        return 0;
5535
    }
5536
    aSign = extractFloat128Sign( a );
5537
    bSign = extractFloat128Sign( b );
5538
    if ( aSign != bSign ) {
5539
        return
5540
               aSign
5541
            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5542
                 == 0 );
5543
    }
5544
    return
5545
          aSign ? le128( b.high, b.low, a.high, a.low )
5546
        : le128( a.high, a.low, b.high, b.low );
5547

    
5548
}
5549

    
5550
/*----------------------------------------------------------------------------
5551
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5552
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5553
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5554
| Standard for Binary Floating-Point Arithmetic.
5555
*----------------------------------------------------------------------------*/
5556

    
5557
int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5558
{
5559
    flag aSign, bSign;
5560

    
5561
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5562
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5563
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5564
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5565
       ) {
5566
        if (    float128_is_signaling_nan( a )
5567
             || float128_is_signaling_nan( b ) ) {
5568
            float_raise( float_flag_invalid STATUS_VAR);
5569
        }
5570
        return 0;
5571
    }
5572
    aSign = extractFloat128Sign( a );
5573
    bSign = extractFloat128Sign( b );
5574
    if ( aSign != bSign ) {
5575
        return
5576
               aSign
5577
            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5578
                 != 0 );
5579
    }
5580
    return
5581
          aSign ? lt128( b.high, b.low, a.high, a.low )
5582
        : lt128( a.high, a.low, b.high, b.low );
5583

    
5584
}
5585

    
5586
#endif
5587

    
5588
/* misc functions */
5589
float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5590
{
5591
    return int64_to_float32(a STATUS_VAR);
5592
}
5593

    
5594
float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5595
{
5596
    return int64_to_float64(a STATUS_VAR);
5597
}
5598

    
5599
unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5600
{
5601
    int64_t v;
5602
    unsigned int res;
5603

    
5604
    v = float32_to_int64(a STATUS_VAR);
5605
    if (v < 0) {
5606
        res = 0;
5607
        float_raise( float_flag_invalid STATUS_VAR);
5608
    } else if (v > 0xffffffff) {
5609
        res = 0xffffffff;
5610
        float_raise( float_flag_invalid STATUS_VAR);
5611
    } else {
5612
        res = v;
5613
    }
5614
    return res;
5615
}
5616

    
5617
unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5618
{
5619
    int64_t v;
5620
    unsigned int res;
5621

    
5622
    v = float32_to_int64_round_to_zero(a STATUS_VAR);
5623
    if (v < 0) {
5624
        res = 0;
5625
        float_raise( float_flag_invalid STATUS_VAR);
5626
    } else if (v > 0xffffffff) {
5627
        res = 0xffffffff;
5628
        float_raise( float_flag_invalid STATUS_VAR);
5629
    } else {
5630
        res = v;
5631
    }
5632
    return res;
5633
}
5634

    
5635
unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5636
{
5637
    int64_t v;
5638
    unsigned int res;
5639

    
5640
    v = float64_to_int64(a STATUS_VAR);
5641
    if (v < 0) {
5642
        res = 0;
5643
        float_raise( float_flag_invalid STATUS_VAR);
5644
    } else if (v > 0xffffffff) {
5645
        res = 0xffffffff;
5646
        float_raise( float_flag_invalid STATUS_VAR);
5647
    } else {
5648
        res = v;
5649
    }
5650
    return res;
5651
}
5652

    
5653
unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5654
{
5655
    int64_t v;
5656
    unsigned int res;
5657

    
5658
    v = float64_to_int64_round_to_zero(a STATUS_VAR);
5659
    if (v < 0) {
5660
        res = 0;
5661
        float_raise( float_flag_invalid STATUS_VAR);
5662
    } else if (v > 0xffffffff) {
5663
        res = 0xffffffff;
5664
        float_raise( float_flag_invalid STATUS_VAR);
5665
    } else {
5666
        res = v;
5667
    }
5668
    return res;
5669
}
5670

    
5671
/* FIXME: This looks broken.  */
5672
uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5673
{
5674
    int64_t v;
5675

    
5676
    v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5677
    v += float64_val(a);
5678
    v = float64_to_int64(make_float64(v) STATUS_VAR);
5679

    
5680
    return v - INT64_MIN;
5681
}
5682

    
5683
uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5684
{
5685
    int64_t v;
5686

    
5687
    v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5688
    v += float64_val(a);
5689
    v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5690

    
5691
    return v - INT64_MIN;
5692
}
5693

    
5694
#define COMPARE(s, nan_exp)                                                  \
5695
INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
5696
                                      int is_quiet STATUS_PARAM )            \
5697
{                                                                            \
5698
    flag aSign, bSign;                                                       \
5699
    bits ## s av, bv;                                                        \
5700
                                                                             \
5701
    if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
5702
         extractFloat ## s ## Frac( a ) ) ||                                 \
5703
        ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
5704
          extractFloat ## s ## Frac( b ) )) {                                \
5705
        if (!is_quiet ||                                                     \
5706
            float ## s ## _is_signaling_nan( a ) ||                          \
5707
            float ## s ## _is_signaling_nan( b ) ) {                         \
5708
            float_raise( float_flag_invalid STATUS_VAR);                     \
5709
        }                                                                    \
5710
        return float_relation_unordered;                                     \
5711
    }                                                                        \
5712
    aSign = extractFloat ## s ## Sign( a );                                  \
5713
    bSign = extractFloat ## s ## Sign( b );                                  \
5714
    av = float ## s ## _val(a);                                              \
5715
    bv = float ## s ## _val(b);                                              \
5716
    if ( aSign != bSign ) {                                                  \
5717
        if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) {                         \
5718
            /* zero case */                                                  \
5719
            return float_relation_equal;                                     \
5720
        } else {                                                             \
5721
            return 1 - (2 * aSign);                                          \
5722
        }                                                                    \
5723
    } else {                                                                 \
5724
        if (av == bv) {                                                      \
5725
            return float_relation_equal;                                     \
5726
        } else {                                                             \
5727
            return 1 - 2 * (aSign ^ ( av < bv ));                            \
5728
        }                                                                    \
5729
    }                                                                        \
5730
}                                                                            \
5731
                                                                             \
5732
int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
5733
{                                                                            \
5734
    return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
5735
}                                                                            \
5736
                                                                             \
5737
int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
5738
{                                                                            \
5739
    return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
5740
}
5741

    
5742
COMPARE(32, 0xff)
5743
COMPARE(64, 0x7ff)
5744

    
5745
INLINE int float128_compare_internal( float128 a, float128 b,
5746
                                      int is_quiet STATUS_PARAM )
5747
{
5748
    flag aSign, bSign;
5749

    
5750
    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
5751
          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
5752
        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
5753
          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
5754
        if (!is_quiet ||
5755
            float128_is_signaling_nan( a ) ||
5756
            float128_is_signaling_nan( b ) ) {
5757
            float_raise( float_flag_invalid STATUS_VAR);
5758
        }
5759
        return float_relation_unordered;
5760
    }
5761
    aSign = extractFloat128Sign( a );
5762
    bSign = extractFloat128Sign( b );
5763
    if ( aSign != bSign ) {
5764
        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
5765
            /* zero case */
5766
            return float_relation_equal;
5767
        } else {
5768
            return 1 - (2 * aSign);
5769
        }
5770
    } else {
5771
        if (a.low == b.low && a.high == b.high) {
5772
            return float_relation_equal;
5773
        } else {
5774
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
5775
        }
5776
    }
5777
}
5778

    
5779
int float128_compare( float128 a, float128 b STATUS_PARAM )
5780
{
5781
    return float128_compare_internal(a, b, 0 STATUS_VAR);
5782
}
5783

    
5784
int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
5785
{
5786
    return float128_compare_internal(a, b, 1 STATUS_VAR);
5787
}
5788

    
5789
/* Multiply A by 2 raised to the power N.  */
5790
float32 float32_scalbn( float32 a, int n STATUS_PARAM )
5791
{
5792
    flag aSign;
5793
    int16 aExp;
5794
    bits32 aSig;
5795

    
5796
    aSig = extractFloat32Frac( a );
5797
    aExp = extractFloat32Exp( a );
5798
    aSign = extractFloat32Sign( a );
5799

    
5800
    if ( aExp == 0xFF ) {
5801
        return a;
5802
    }
5803
    if ( aExp != 0 )
5804
        aSig |= 0x00800000;
5805
    else if ( aSig == 0 )
5806
        return a;
5807

    
5808
    aExp += n - 1;
5809
    aSig <<= 7;
5810
    return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
5811
}
5812

    
5813
float64 float64_scalbn( float64 a, int n STATUS_PARAM )
5814
{
5815
    flag aSign;
5816
    int16 aExp;
5817
    bits64 aSig;
5818

    
5819
    aSig = extractFloat64Frac( a );
5820
    aExp = extractFloat64Exp( a );
5821
    aSign = extractFloat64Sign( a );
5822

    
5823
    if ( aExp == 0x7FF ) {
5824
        return a;
5825
    }
5826
    if ( aExp != 0 )
5827
        aSig |= LIT64( 0x0010000000000000 );
5828
    else if ( aSig == 0 )
5829
        return a;
5830

    
5831
    aExp += n - 1;
5832
    aSig <<= 10;
5833
    return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
5834
}
5835

    
5836
#ifdef FLOATX80
5837
floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
5838
{
5839
    flag aSign;
5840
    int16 aExp;
5841
    bits64 aSig;
5842

    
5843
    aSig = extractFloatx80Frac( a );
5844
    aExp = extractFloatx80Exp( a );
5845
    aSign = extractFloatx80Sign( a );
5846

    
5847
    if ( aExp == 0x7FF ) {
5848
        return a;
5849
    }
5850
    if (aExp == 0 && aSig == 0)
5851
        return a;
5852

    
5853
    aExp += n;
5854
    return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
5855
                                          aSign, aExp, aSig, 0 STATUS_VAR );
5856
}
5857
#endif
5858

    
5859
#ifdef FLOAT128
5860
float128 float128_scalbn( float128 a, int n STATUS_PARAM )
5861
{
5862
    flag aSign;
5863
    int32 aExp;
5864
    bits64 aSig0, aSig1;
5865

    
5866
    aSig1 = extractFloat128Frac1( a );
5867
    aSig0 = extractFloat128Frac0( a );
5868
    aExp = extractFloat128Exp( a );
5869
    aSign = extractFloat128Sign( a );
5870
    if ( aExp == 0x7FFF ) {
5871
        return a;
5872
    }
5873
    if ( aExp != 0 )
5874
        aSig0 |= LIT64( 0x0001000000000000 );
5875
    else if ( aSig0 == 0 && aSig1 == 0 )
5876
        return a;
5877

    
5878
    aExp += n - 1;
5879
    return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
5880
                                          STATUS_VAR );
5881

    
5882
}
5883
#endif