Statistics
| Branch: | Revision:

root / fpu / softfloat.c @ a63b5829

History | View | Annotate | Download (202 kB)

1

    
2
/*============================================================================
3

4
This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5
Package, Release 2b.
6

7
Written by John R. Hauser.  This work was made possible in part by the
8
International Computer Science Institute, located at Suite 600, 1947 Center
9
Street, Berkeley, California 94704.  Funding was partially provided by the
10
National Science Foundation under grant MIP-9311980.  The original version
11
of this code was written as part of a project to build a fixed-point vector
12
processor in collaboration with the University of California at Berkeley,
13
overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
14
is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15
arithmetic/SoftFloat.html'.
16

17
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
18
been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19
RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20
AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21
COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22
EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23
INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24
OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
25

26
Derivative works are acceptable, even for commercial purposes, so long as
27
(1) the source code for the derivative work includes prominent notice that
28
the work is derivative, and (2) the source code includes prominent notice with
29
these four paragraphs for those parts of this code that are retained.
30

31
=============================================================================*/
32

    
33
/* FIXME: Flush-To-Zero only effects results.  Denormal inputs should also
34
   be flushed to zero.  */
35
#include "softfloat.h"
36

    
37
/*----------------------------------------------------------------------------
38
| Primitive arithmetic functions, including multi-word arithmetic, and
39
| division and square root approximations.  (Can be specialized to target if
40
| desired.)
41
*----------------------------------------------------------------------------*/
42
#include "softfloat-macros.h"
43

    
44
/*----------------------------------------------------------------------------
45
| Functions and definitions to determine:  (1) whether tininess for underflow
46
| is detected before or after rounding by default, (2) what (if anything)
47
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
48
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
49
| are propagated from function inputs to output.  These details are target-
50
| specific.
51
*----------------------------------------------------------------------------*/
52
#include "softfloat-specialize.h"
53

    
54
void set_float_rounding_mode(int val STATUS_PARAM)
55
{
56
    STATUS(float_rounding_mode) = val;
57
}
58

    
59
void set_float_exception_flags(int val STATUS_PARAM)
60
{
61
    STATUS(float_exception_flags) = val;
62
}
63

    
64
#ifdef FLOATX80
65
void set_floatx80_rounding_precision(int val STATUS_PARAM)
66
{
67
    STATUS(floatx80_rounding_precision) = val;
68
}
69
#endif
70

    
71
/*----------------------------------------------------------------------------
72
| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
73
| and 7, and returns the properly rounded 32-bit integer corresponding to the
74
| input.  If `zSign' is 1, the input is negated before being converted to an
75
| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
76
| is simply rounded to an integer, with the inexact exception raised if the
77
| input cannot be represented exactly as an integer.  However, if the fixed-
78
| point input is too large, the invalid exception is raised and the largest
79
| positive or negative integer is returned.
80
*----------------------------------------------------------------------------*/
81

    
82
static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
83
{
84
    int8 roundingMode;
85
    flag roundNearestEven;
86
    int8 roundIncrement, roundBits;
87
    int32 z;
88

    
89
    roundingMode = STATUS(float_rounding_mode);
90
    roundNearestEven = ( roundingMode == float_round_nearest_even );
91
    roundIncrement = 0x40;
92
    if ( ! roundNearestEven ) {
93
        if ( roundingMode == float_round_to_zero ) {
94
            roundIncrement = 0;
95
        }
96
        else {
97
            roundIncrement = 0x7F;
98
            if ( zSign ) {
99
                if ( roundingMode == float_round_up ) roundIncrement = 0;
100
            }
101
            else {
102
                if ( roundingMode == float_round_down ) roundIncrement = 0;
103
            }
104
        }
105
    }
106
    roundBits = absZ & 0x7F;
107
    absZ = ( absZ + roundIncrement )>>7;
108
    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
109
    z = absZ;
110
    if ( zSign ) z = - z;
111
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
112
        float_raise( float_flag_invalid STATUS_VAR);
113
        return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
114
    }
115
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
116
    return z;
117

    
118
}
119

    
120
/*----------------------------------------------------------------------------
121
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
122
| `absZ1', with binary point between bits 63 and 64 (between the input words),
123
| and returns the properly rounded 64-bit integer corresponding to the input.
124
| If `zSign' is 1, the input is negated before being converted to an integer.
125
| Ordinarily, the fixed-point input is simply rounded to an integer, with
126
| the inexact exception raised if the input cannot be represented exactly as
127
| an integer.  However, if the fixed-point input is too large, the invalid
128
| exception is raised and the largest positive or negative integer is
129
| returned.
130
*----------------------------------------------------------------------------*/
131

    
132
static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
133
{
134
    int8 roundingMode;
135
    flag roundNearestEven, increment;
136
    int64 z;
137

    
138
    roundingMode = STATUS(float_rounding_mode);
139
    roundNearestEven = ( roundingMode == float_round_nearest_even );
140
    increment = ( (sbits64) absZ1 < 0 );
141
    if ( ! roundNearestEven ) {
142
        if ( roundingMode == float_round_to_zero ) {
143
            increment = 0;
144
        }
145
        else {
146
            if ( zSign ) {
147
                increment = ( roundingMode == float_round_down ) && absZ1;
148
            }
149
            else {
150
                increment = ( roundingMode == float_round_up ) && absZ1;
151
            }
152
        }
153
    }
154
    if ( increment ) {
155
        ++absZ0;
156
        if ( absZ0 == 0 ) goto overflow;
157
        absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
158
    }
159
    z = absZ0;
160
    if ( zSign ) z = - z;
161
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
162
 overflow:
163
        float_raise( float_flag_invalid STATUS_VAR);
164
        return
165
              zSign ? (sbits64) LIT64( 0x8000000000000000 )
166
            : LIT64( 0x7FFFFFFFFFFFFFFF );
167
    }
168
    if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
169
    return z;
170

    
171
}
172

    
173
/*----------------------------------------------------------------------------
174
| Returns the fraction bits of the single-precision floating-point value `a'.
175
*----------------------------------------------------------------------------*/
176

    
177
INLINE bits32 extractFloat32Frac( float32 a )
178
{
179

    
180
    return float32_val(a) & 0x007FFFFF;
181

    
182
}
183

    
184
/*----------------------------------------------------------------------------
185
| Returns the exponent bits of the single-precision floating-point value `a'.
186
*----------------------------------------------------------------------------*/
187

    
188
INLINE int16 extractFloat32Exp( float32 a )
189
{
190

    
191
    return ( float32_val(a)>>23 ) & 0xFF;
192

    
193
}
194

    
195
/*----------------------------------------------------------------------------
196
| Returns the sign bit of the single-precision floating-point value `a'.
197
*----------------------------------------------------------------------------*/
198

    
199
INLINE flag extractFloat32Sign( float32 a )
200
{
201

    
202
    return float32_val(a)>>31;
203

    
204
}
205

    
206
/*----------------------------------------------------------------------------
207
| Normalizes the subnormal single-precision floating-point value represented
208
| by the denormalized significand `aSig'.  The normalized exponent and
209
| significand are stored at the locations pointed to by `zExpPtr' and
210
| `zSigPtr', respectively.
211
*----------------------------------------------------------------------------*/
212

    
213
static void
214
 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
215
{
216
    int8 shiftCount;
217

    
218
    shiftCount = countLeadingZeros32( aSig ) - 8;
219
    *zSigPtr = aSig<<shiftCount;
220
    *zExpPtr = 1 - shiftCount;
221

    
222
}
223

    
224
/*----------------------------------------------------------------------------
225
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
226
| single-precision floating-point value, returning the result.  After being
227
| shifted into the proper positions, the three fields are simply added
228
| together to form the result.  This means that any integer portion of `zSig'
229
| will be added into the exponent.  Since a properly normalized significand
230
| will have an integer portion equal to 1, the `zExp' input should be 1 less
231
| than the desired result exponent whenever `zSig' is a complete, normalized
232
| significand.
233
*----------------------------------------------------------------------------*/
234

    
235
INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
236
{
237

    
238
    return make_float32(
239
          ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
240

    
241
}
242

    
243
/*----------------------------------------------------------------------------
244
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
245
| and significand `zSig', and returns the proper single-precision floating-
246
| point value corresponding to the abstract input.  Ordinarily, the abstract
247
| value is simply rounded and packed into the single-precision format, with
248
| the inexact exception raised if the abstract input cannot be represented
249
| exactly.  However, if the abstract value is too large, the overflow and
250
| inexact exceptions are raised and an infinity or maximal finite value is
251
| returned.  If the abstract value is too small, the input value is rounded to
252
| a subnormal number, and the underflow and inexact exceptions are raised if
253
| the abstract input cannot be represented exactly as a subnormal single-
254
| precision floating-point number.
255
|     The input significand `zSig' has its binary point between bits 30
256
| and 29, which is 7 bits to the left of the usual location.  This shifted
257
| significand must be normalized or smaller.  If `zSig' is not normalized,
258
| `zExp' must be 0; in that case, the result returned is a subnormal number,
259
| and it must not require rounding.  In the usual case that `zSig' is
260
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
261
| The handling of underflow and overflow follows the IEC/IEEE Standard for
262
| Binary Floating-Point Arithmetic.
263
*----------------------------------------------------------------------------*/
264

    
265
static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
266
{
267
    int8 roundingMode;
268
    flag roundNearestEven;
269
    int8 roundIncrement, roundBits;
270
    flag isTiny;
271

    
272
    roundingMode = STATUS(float_rounding_mode);
273
    roundNearestEven = ( roundingMode == float_round_nearest_even );
274
    roundIncrement = 0x40;
275
    if ( ! roundNearestEven ) {
276
        if ( roundingMode == float_round_to_zero ) {
277
            roundIncrement = 0;
278
        }
279
        else {
280
            roundIncrement = 0x7F;
281
            if ( zSign ) {
282
                if ( roundingMode == float_round_up ) roundIncrement = 0;
283
            }
284
            else {
285
                if ( roundingMode == float_round_down ) roundIncrement = 0;
286
            }
287
        }
288
    }
289
    roundBits = zSig & 0x7F;
290
    if ( 0xFD <= (bits16) zExp ) {
291
        if (    ( 0xFD < zExp )
292
             || (    ( zExp == 0xFD )
293
                  && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
294
           ) {
295
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
296
            return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
297
        }
298
        if ( zExp < 0 ) {
299
            if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
300
            isTiny =
301
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
302
                || ( zExp < -1 )
303
                || ( zSig + roundIncrement < 0x80000000 );
304
            shift32RightJamming( zSig, - zExp, &zSig );
305
            zExp = 0;
306
            roundBits = zSig & 0x7F;
307
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
308
        }
309
    }
310
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
311
    zSig = ( zSig + roundIncrement )>>7;
312
    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
313
    if ( zSig == 0 ) zExp = 0;
314
    return packFloat32( zSign, zExp, zSig );
315

    
316
}
317

    
318
/*----------------------------------------------------------------------------
319
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
320
| and significand `zSig', and returns the proper single-precision floating-
321
| point value corresponding to the abstract input.  This routine is just like
322
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
323
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
324
| floating-point exponent.
325
*----------------------------------------------------------------------------*/
326

    
327
static float32
328
 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
329
{
330
    int8 shiftCount;
331

    
332
    shiftCount = countLeadingZeros32( zSig ) - 1;
333
    return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
334

    
335
}
336

    
337
/*----------------------------------------------------------------------------
338
| Returns the fraction bits of the double-precision floating-point value `a'.
339
*----------------------------------------------------------------------------*/
340

    
341
INLINE bits64 extractFloat64Frac( float64 a )
342
{
343

    
344
    return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
345

    
346
}
347

    
348
/*----------------------------------------------------------------------------
349
| Returns the exponent bits of the double-precision floating-point value `a'.
350
*----------------------------------------------------------------------------*/
351

    
352
INLINE int16 extractFloat64Exp( float64 a )
353
{
354

    
355
    return ( float64_val(a)>>52 ) & 0x7FF;
356

    
357
}
358

    
359
/*----------------------------------------------------------------------------
360
| Returns the sign bit of the double-precision floating-point value `a'.
361
*----------------------------------------------------------------------------*/
362

    
363
INLINE flag extractFloat64Sign( float64 a )
364
{
365

    
366
    return float64_val(a)>>63;
367

    
368
}
369

    
370
/*----------------------------------------------------------------------------
371
| Normalizes the subnormal double-precision floating-point value represented
372
| by the denormalized significand `aSig'.  The normalized exponent and
373
| significand are stored at the locations pointed to by `zExpPtr' and
374
| `zSigPtr', respectively.
375
*----------------------------------------------------------------------------*/
376

    
377
static void
378
 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
379
{
380
    int8 shiftCount;
381

    
382
    shiftCount = countLeadingZeros64( aSig ) - 11;
383
    *zSigPtr = aSig<<shiftCount;
384
    *zExpPtr = 1 - shiftCount;
385

    
386
}
387

    
388
/*----------------------------------------------------------------------------
389
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
390
| double-precision floating-point value, returning the result.  After being
391
| shifted into the proper positions, the three fields are simply added
392
| together to form the result.  This means that any integer portion of `zSig'
393
| will be added into the exponent.  Since a properly normalized significand
394
| will have an integer portion equal to 1, the `zExp' input should be 1 less
395
| than the desired result exponent whenever `zSig' is a complete, normalized
396
| significand.
397
*----------------------------------------------------------------------------*/
398

    
399
INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
400
{
401

    
402
    return make_float64(
403
        ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
404

    
405
}
406

    
407
/*----------------------------------------------------------------------------
408
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
409
| and significand `zSig', and returns the proper double-precision floating-
410
| point value corresponding to the abstract input.  Ordinarily, the abstract
411
| value is simply rounded and packed into the double-precision format, with
412
| the inexact exception raised if the abstract input cannot be represented
413
| exactly.  However, if the abstract value is too large, the overflow and
414
| inexact exceptions are raised and an infinity or maximal finite value is
415
| returned.  If the abstract value is too small, the input value is rounded
416
| to a subnormal number, and the underflow and inexact exceptions are raised
417
| if the abstract input cannot be represented exactly as a subnormal double-
418
| precision floating-point number.
419
|     The input significand `zSig' has its binary point between bits 62
420
| and 61, which is 10 bits to the left of the usual location.  This shifted
421
| significand must be normalized or smaller.  If `zSig' is not normalized,
422
| `zExp' must be 0; in that case, the result returned is a subnormal number,
423
| and it must not require rounding.  In the usual case that `zSig' is
424
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
425
| The handling of underflow and overflow follows the IEC/IEEE Standard for
426
| Binary Floating-Point Arithmetic.
427
*----------------------------------------------------------------------------*/
428

    
429
static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
430
{
431
    int8 roundingMode;
432
    flag roundNearestEven;
433
    int16 roundIncrement, roundBits;
434
    flag isTiny;
435

    
436
    roundingMode = STATUS(float_rounding_mode);
437
    roundNearestEven = ( roundingMode == float_round_nearest_even );
438
    roundIncrement = 0x200;
439
    if ( ! roundNearestEven ) {
440
        if ( roundingMode == float_round_to_zero ) {
441
            roundIncrement = 0;
442
        }
443
        else {
444
            roundIncrement = 0x3FF;
445
            if ( zSign ) {
446
                if ( roundingMode == float_round_up ) roundIncrement = 0;
447
            }
448
            else {
449
                if ( roundingMode == float_round_down ) roundIncrement = 0;
450
            }
451
        }
452
    }
453
    roundBits = zSig & 0x3FF;
454
    if ( 0x7FD <= (bits16) zExp ) {
455
        if (    ( 0x7FD < zExp )
456
             || (    ( zExp == 0x7FD )
457
                  && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
458
           ) {
459
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
460
            return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
461
        }
462
        if ( zExp < 0 ) {
463
            if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
464
            isTiny =
465
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
466
                || ( zExp < -1 )
467
                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
468
            shift64RightJamming( zSig, - zExp, &zSig );
469
            zExp = 0;
470
            roundBits = zSig & 0x3FF;
471
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
472
        }
473
    }
474
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
475
    zSig = ( zSig + roundIncrement )>>10;
476
    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
477
    if ( zSig == 0 ) zExp = 0;
478
    return packFloat64( zSign, zExp, zSig );
479

    
480
}
481

    
482
/*----------------------------------------------------------------------------
483
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
484
| and significand `zSig', and returns the proper double-precision floating-
485
| point value corresponding to the abstract input.  This routine is just like
486
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
487
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
488
| floating-point exponent.
489
*----------------------------------------------------------------------------*/
490

    
491
static float64
492
 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
493
{
494
    int8 shiftCount;
495

    
496
    shiftCount = countLeadingZeros64( zSig ) - 1;
497
    return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
498

    
499
}
500

    
501
#ifdef FLOATX80
502

    
503
/*----------------------------------------------------------------------------
504
| Returns the fraction bits of the extended double-precision floating-point
505
| value `a'.
506
*----------------------------------------------------------------------------*/
507

    
508
INLINE bits64 extractFloatx80Frac( floatx80 a )
509
{
510

    
511
    return a.low;
512

    
513
}
514

    
515
/*----------------------------------------------------------------------------
516
| Returns the exponent bits of the extended double-precision floating-point
517
| value `a'.
518
*----------------------------------------------------------------------------*/
519

    
520
INLINE int32 extractFloatx80Exp( floatx80 a )
521
{
522

    
523
    return a.high & 0x7FFF;
524

    
525
}
526

    
527
/*----------------------------------------------------------------------------
528
| Returns the sign bit of the extended double-precision floating-point value
529
| `a'.
530
*----------------------------------------------------------------------------*/
531

    
532
INLINE flag extractFloatx80Sign( floatx80 a )
533
{
534

    
535
    return a.high>>15;
536

    
537
}
538

    
539
/*----------------------------------------------------------------------------
540
| Normalizes the subnormal extended double-precision floating-point value
541
| represented by the denormalized significand `aSig'.  The normalized exponent
542
| and significand are stored at the locations pointed to by `zExpPtr' and
543
| `zSigPtr', respectively.
544
*----------------------------------------------------------------------------*/
545

    
546
static void
547
 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
548
{
549
    int8 shiftCount;
550

    
551
    shiftCount = countLeadingZeros64( aSig );
552
    *zSigPtr = aSig<<shiftCount;
553
    *zExpPtr = 1 - shiftCount;
554

    
555
}
556

    
557
/*----------------------------------------------------------------------------
558
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
559
| extended double-precision floating-point value, returning the result.
560
*----------------------------------------------------------------------------*/
561

    
562
INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
563
{
564
    floatx80 z;
565

    
566
    z.low = zSig;
567
    z.high = ( ( (bits16) zSign )<<15 ) + zExp;
568
    return z;
569

    
570
}
571

    
572
/*----------------------------------------------------------------------------
573
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
574
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
575
| and returns the proper extended double-precision floating-point value
576
| corresponding to the abstract input.  Ordinarily, the abstract value is
577
| rounded and packed into the extended double-precision format, with the
578
| inexact exception raised if the abstract input cannot be represented
579
| exactly.  However, if the abstract value is too large, the overflow and
580
| inexact exceptions are raised and an infinity or maximal finite value is
581
| returned.  If the abstract value is too small, the input value is rounded to
582
| a subnormal number, and the underflow and inexact exceptions are raised if
583
| the abstract input cannot be represented exactly as a subnormal extended
584
| double-precision floating-point number.
585
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
586
| number of bits as single or double precision, respectively.  Otherwise, the
587
| result is rounded to the full precision of the extended double-precision
588
| format.
589
|     The input significand must be normalized or smaller.  If the input
590
| significand is not normalized, `zExp' must be 0; in that case, the result
591
| returned is a subnormal number, and it must not require rounding.  The
592
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
593
| Floating-Point Arithmetic.
594
*----------------------------------------------------------------------------*/
595

    
596
static floatx80
597
 roundAndPackFloatx80(
598
     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
599
 STATUS_PARAM)
600
{
601
    int8 roundingMode;
602
    flag roundNearestEven, increment, isTiny;
603
    int64 roundIncrement, roundMask, roundBits;
604

    
605
    roundingMode = STATUS(float_rounding_mode);
606
    roundNearestEven = ( roundingMode == float_round_nearest_even );
607
    if ( roundingPrecision == 80 ) goto precision80;
608
    if ( roundingPrecision == 64 ) {
609
        roundIncrement = LIT64( 0x0000000000000400 );
610
        roundMask = LIT64( 0x00000000000007FF );
611
    }
612
    else if ( roundingPrecision == 32 ) {
613
        roundIncrement = LIT64( 0x0000008000000000 );
614
        roundMask = LIT64( 0x000000FFFFFFFFFF );
615
    }
616
    else {
617
        goto precision80;
618
    }
619
    zSig0 |= ( zSig1 != 0 );
620
    if ( ! roundNearestEven ) {
621
        if ( roundingMode == float_round_to_zero ) {
622
            roundIncrement = 0;
623
        }
624
        else {
625
            roundIncrement = roundMask;
626
            if ( zSign ) {
627
                if ( roundingMode == float_round_up ) roundIncrement = 0;
628
            }
629
            else {
630
                if ( roundingMode == float_round_down ) roundIncrement = 0;
631
            }
632
        }
633
    }
634
    roundBits = zSig0 & roundMask;
635
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
636
        if (    ( 0x7FFE < zExp )
637
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
638
           ) {
639
            goto overflow;
640
        }
641
        if ( zExp <= 0 ) {
642
            if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
643
            isTiny =
644
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
645
                || ( zExp < 0 )
646
                || ( zSig0 <= zSig0 + roundIncrement );
647
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
648
            zExp = 0;
649
            roundBits = zSig0 & roundMask;
650
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
651
            if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
652
            zSig0 += roundIncrement;
653
            if ( (sbits64) zSig0 < 0 ) zExp = 1;
654
            roundIncrement = roundMask + 1;
655
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
656
                roundMask |= roundIncrement;
657
            }
658
            zSig0 &= ~ roundMask;
659
            return packFloatx80( zSign, zExp, zSig0 );
660
        }
661
    }
662
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
663
    zSig0 += roundIncrement;
664
    if ( zSig0 < roundIncrement ) {
665
        ++zExp;
666
        zSig0 = LIT64( 0x8000000000000000 );
667
    }
668
    roundIncrement = roundMask + 1;
669
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
670
        roundMask |= roundIncrement;
671
    }
672
    zSig0 &= ~ roundMask;
673
    if ( zSig0 == 0 ) zExp = 0;
674
    return packFloatx80( zSign, zExp, zSig0 );
675
 precision80:
676
    increment = ( (sbits64) zSig1 < 0 );
677
    if ( ! roundNearestEven ) {
678
        if ( roundingMode == float_round_to_zero ) {
679
            increment = 0;
680
        }
681
        else {
682
            if ( zSign ) {
683
                increment = ( roundingMode == float_round_down ) && zSig1;
684
            }
685
            else {
686
                increment = ( roundingMode == float_round_up ) && zSig1;
687
            }
688
        }
689
    }
690
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
691
        if (    ( 0x7FFE < zExp )
692
             || (    ( zExp == 0x7FFE )
693
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
694
                  && increment
695
                )
696
           ) {
697
            roundMask = 0;
698
 overflow:
699
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
700
            if (    ( roundingMode == float_round_to_zero )
701
                 || ( zSign && ( roundingMode == float_round_up ) )
702
                 || ( ! zSign && ( roundingMode == float_round_down ) )
703
               ) {
704
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
705
            }
706
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
707
        }
708
        if ( zExp <= 0 ) {
709
            isTiny =
710
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
711
                || ( zExp < 0 )
712
                || ! increment
713
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
714
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
715
            zExp = 0;
716
            if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
717
            if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
718
            if ( roundNearestEven ) {
719
                increment = ( (sbits64) zSig1 < 0 );
720
            }
721
            else {
722
                if ( zSign ) {
723
                    increment = ( roundingMode == float_round_down ) && zSig1;
724
                }
725
                else {
726
                    increment = ( roundingMode == float_round_up ) && zSig1;
727
                }
728
            }
729
            if ( increment ) {
730
                ++zSig0;
731
                zSig0 &=
732
                    ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
733
                if ( (sbits64) zSig0 < 0 ) zExp = 1;
734
            }
735
            return packFloatx80( zSign, zExp, zSig0 );
736
        }
737
    }
738
    if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
739
    if ( increment ) {
740
        ++zSig0;
741
        if ( zSig0 == 0 ) {
742
            ++zExp;
743
            zSig0 = LIT64( 0x8000000000000000 );
744
        }
745
        else {
746
            zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
747
        }
748
    }
749
    else {
750
        if ( zSig0 == 0 ) zExp = 0;
751
    }
752
    return packFloatx80( zSign, zExp, zSig0 );
753

    
754
}
755

    
756
/*----------------------------------------------------------------------------
757
| Takes an abstract floating-point value having sign `zSign', exponent
758
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
759
| and returns the proper extended double-precision floating-point value
760
| corresponding to the abstract input.  This routine is just like
761
| `roundAndPackFloatx80' except that the input significand does not have to be
762
| normalized.
763
*----------------------------------------------------------------------------*/
764

    
765
static floatx80
766
 normalizeRoundAndPackFloatx80(
767
     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
768
 STATUS_PARAM)
769
{
770
    int8 shiftCount;
771

    
772
    if ( zSig0 == 0 ) {
773
        zSig0 = zSig1;
774
        zSig1 = 0;
775
        zExp -= 64;
776
    }
777
    shiftCount = countLeadingZeros64( zSig0 );
778
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
779
    zExp -= shiftCount;
780
    return
781
        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
782

    
783
}
784

    
785
#endif
786

    
787
#ifdef FLOAT128
788

    
789
/*----------------------------------------------------------------------------
790
| Returns the least-significant 64 fraction bits of the quadruple-precision
791
| floating-point value `a'.
792
*----------------------------------------------------------------------------*/
793

    
794
INLINE bits64 extractFloat128Frac1( float128 a )
795
{
796

    
797
    return a.low;
798

    
799
}
800

    
801
/*----------------------------------------------------------------------------
802
| Returns the most-significant 48 fraction bits of the quadruple-precision
803
| floating-point value `a'.
804
*----------------------------------------------------------------------------*/
805

    
806
INLINE bits64 extractFloat128Frac0( float128 a )
807
{
808

    
809
    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
810

    
811
}
812

    
813
/*----------------------------------------------------------------------------
814
| Returns the exponent bits of the quadruple-precision floating-point value
815
| `a'.
816
*----------------------------------------------------------------------------*/
817

    
818
INLINE int32 extractFloat128Exp( float128 a )
819
{
820

    
821
    return ( a.high>>48 ) & 0x7FFF;
822

    
823
}
824

    
825
/*----------------------------------------------------------------------------
826
| Returns the sign bit of the quadruple-precision floating-point value `a'.
827
*----------------------------------------------------------------------------*/
828

    
829
INLINE flag extractFloat128Sign( float128 a )
830
{
831

    
832
    return a.high>>63;
833

    
834
}
835

    
836
/*----------------------------------------------------------------------------
837
| Normalizes the subnormal quadruple-precision floating-point value
838
| represented by the denormalized significand formed by the concatenation of
839
| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
840
| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
841
| significand are stored at the location pointed to by `zSig0Ptr', and the
842
| least significant 64 bits of the normalized significand are stored at the
843
| location pointed to by `zSig1Ptr'.
844
*----------------------------------------------------------------------------*/
845

    
846
static void
847
 normalizeFloat128Subnormal(
848
     bits64 aSig0,
849
     bits64 aSig1,
850
     int32 *zExpPtr,
851
     bits64 *zSig0Ptr,
852
     bits64 *zSig1Ptr
853
 )
854
{
855
    int8 shiftCount;
856

    
857
    if ( aSig0 == 0 ) {
858
        shiftCount = countLeadingZeros64( aSig1 ) - 15;
859
        if ( shiftCount < 0 ) {
860
            *zSig0Ptr = aSig1>>( - shiftCount );
861
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
862
        }
863
        else {
864
            *zSig0Ptr = aSig1<<shiftCount;
865
            *zSig1Ptr = 0;
866
        }
867
        *zExpPtr = - shiftCount - 63;
868
    }
869
    else {
870
        shiftCount = countLeadingZeros64( aSig0 ) - 15;
871
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
872
        *zExpPtr = 1 - shiftCount;
873
    }
874

    
875
}
876

    
877
/*----------------------------------------------------------------------------
878
| Packs the sign `zSign', the exponent `zExp', and the significand formed
879
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
880
| floating-point value, returning the result.  After being shifted into the
881
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
882
| added together to form the most significant 32 bits of the result.  This
883
| means that any integer portion of `zSig0' will be added into the exponent.
884
| Since a properly normalized significand will have an integer portion equal
885
| to 1, the `zExp' input should be 1 less than the desired result exponent
886
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
887
| significand.
888
*----------------------------------------------------------------------------*/
889

    
890
INLINE float128
891
 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
892
{
893
    float128 z;
894

    
895
    z.low = zSig1;
896
    z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
897
    return z;
898

    
899
}
900

    
901
/*----------------------------------------------------------------------------
902
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
903
| and extended significand formed by the concatenation of `zSig0', `zSig1',
904
| and `zSig2', and returns the proper quadruple-precision floating-point value
905
| corresponding to the abstract input.  Ordinarily, the abstract value is
906
| simply rounded and packed into the quadruple-precision format, with the
907
| inexact exception raised if the abstract input cannot be represented
908
| exactly.  However, if the abstract value is too large, the overflow and
909
| inexact exceptions are raised and an infinity or maximal finite value is
910
| returned.  If the abstract value is too small, the input value is rounded to
911
| a subnormal number, and the underflow and inexact exceptions are raised if
912
| the abstract input cannot be represented exactly as a subnormal quadruple-
913
| precision floating-point number.
914
|     The input significand must be normalized or smaller.  If the input
915
| significand is not normalized, `zExp' must be 0; in that case, the result
916
| returned is a subnormal number, and it must not require rounding.  In the
917
| usual case that the input significand is normalized, `zExp' must be 1 less
918
| than the ``true'' floating-point exponent.  The handling of underflow and
919
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
920
*----------------------------------------------------------------------------*/
921

    
922
static float128
923
 roundAndPackFloat128(
924
     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
925
{
926
    int8 roundingMode;
927
    flag roundNearestEven, increment, isTiny;
928

    
929
    roundingMode = STATUS(float_rounding_mode);
930
    roundNearestEven = ( roundingMode == float_round_nearest_even );
931
    increment = ( (sbits64) zSig2 < 0 );
932
    if ( ! roundNearestEven ) {
933
        if ( roundingMode == float_round_to_zero ) {
934
            increment = 0;
935
        }
936
        else {
937
            if ( zSign ) {
938
                increment = ( roundingMode == float_round_down ) && zSig2;
939
            }
940
            else {
941
                increment = ( roundingMode == float_round_up ) && zSig2;
942
            }
943
        }
944
    }
945
    if ( 0x7FFD <= (bits32) zExp ) {
946
        if (    ( 0x7FFD < zExp )
947
             || (    ( zExp == 0x7FFD )
948
                  && eq128(
949
                         LIT64( 0x0001FFFFFFFFFFFF ),
950
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
951
                         zSig0,
952
                         zSig1
953
                     )
954
                  && increment
955
                )
956
           ) {
957
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
958
            if (    ( roundingMode == float_round_to_zero )
959
                 || ( zSign && ( roundingMode == float_round_up ) )
960
                 || ( ! zSign && ( roundingMode == float_round_down ) )
961
               ) {
962
                return
963
                    packFloat128(
964
                        zSign,
965
                        0x7FFE,
966
                        LIT64( 0x0000FFFFFFFFFFFF ),
967
                        LIT64( 0xFFFFFFFFFFFFFFFF )
968
                    );
969
            }
970
            return packFloat128( zSign, 0x7FFF, 0, 0 );
971
        }
972
        if ( zExp < 0 ) {
973
            if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
974
            isTiny =
975
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
976
                || ( zExp < -1 )
977
                || ! increment
978
                || lt128(
979
                       zSig0,
980
                       zSig1,
981
                       LIT64( 0x0001FFFFFFFFFFFF ),
982
                       LIT64( 0xFFFFFFFFFFFFFFFF )
983
                   );
984
            shift128ExtraRightJamming(
985
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
986
            zExp = 0;
987
            if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
988
            if ( roundNearestEven ) {
989
                increment = ( (sbits64) zSig2 < 0 );
990
            }
991
            else {
992
                if ( zSign ) {
993
                    increment = ( roundingMode == float_round_down ) && zSig2;
994
                }
995
                else {
996
                    increment = ( roundingMode == float_round_up ) && zSig2;
997
                }
998
            }
999
        }
1000
    }
1001
    if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1002
    if ( increment ) {
1003
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1004
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1005
    }
1006
    else {
1007
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1008
    }
1009
    return packFloat128( zSign, zExp, zSig0, zSig1 );
1010

    
1011
}
1012

    
1013
/*----------------------------------------------------------------------------
1014
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1015
| and significand formed by the concatenation of `zSig0' and `zSig1', and
1016
| returns the proper quadruple-precision floating-point value corresponding
1017
| to the abstract input.  This routine is just like `roundAndPackFloat128'
1018
| except that the input significand has fewer bits and does not have to be
1019
| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1020
| point exponent.
1021
*----------------------------------------------------------------------------*/
1022

    
1023
static float128
1024
 normalizeRoundAndPackFloat128(
1025
     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1026
{
1027
    int8 shiftCount;
1028
    bits64 zSig2;
1029

    
1030
    if ( zSig0 == 0 ) {
1031
        zSig0 = zSig1;
1032
        zSig1 = 0;
1033
        zExp -= 64;
1034
    }
1035
    shiftCount = countLeadingZeros64( zSig0 ) - 15;
1036
    if ( 0 <= shiftCount ) {
1037
        zSig2 = 0;
1038
        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1039
    }
1040
    else {
1041
        shift128ExtraRightJamming(
1042
            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1043
    }
1044
    zExp -= shiftCount;
1045
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1046

    
1047
}
1048

    
1049
#endif
1050

    
1051
/*----------------------------------------------------------------------------
1052
| Returns the result of converting the 32-bit two's complement integer `a'
1053
| to the single-precision floating-point format.  The conversion is performed
1054
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1055
*----------------------------------------------------------------------------*/
1056

    
1057
float32 int32_to_float32( int32 a STATUS_PARAM )
1058
{
1059
    flag zSign;
1060

    
1061
    if ( a == 0 ) return float32_zero;
1062
    if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1063
    zSign = ( a < 0 );
1064
    return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1065

    
1066
}
1067

    
1068
/*----------------------------------------------------------------------------
1069
| Returns the result of converting the 32-bit two's complement integer `a'
1070
| to the double-precision floating-point format.  The conversion is performed
1071
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1072
*----------------------------------------------------------------------------*/
1073

    
1074
float64 int32_to_float64( int32 a STATUS_PARAM )
1075
{
1076
    flag zSign;
1077
    uint32 absA;
1078
    int8 shiftCount;
1079
    bits64 zSig;
1080

    
1081
    if ( a == 0 ) return float64_zero;
1082
    zSign = ( a < 0 );
1083
    absA = zSign ? - a : a;
1084
    shiftCount = countLeadingZeros32( absA ) + 21;
1085
    zSig = absA;
1086
    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1087

    
1088
}
1089

    
1090
#ifdef FLOATX80
1091

    
1092
/*----------------------------------------------------------------------------
1093
| Returns the result of converting the 32-bit two's complement integer `a'
1094
| to the extended double-precision floating-point format.  The conversion
1095
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1096
| Arithmetic.
1097
*----------------------------------------------------------------------------*/
1098

    
1099
floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1100
{
1101
    flag zSign;
1102
    uint32 absA;
1103
    int8 shiftCount;
1104
    bits64 zSig;
1105

    
1106
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1107
    zSign = ( a < 0 );
1108
    absA = zSign ? - a : a;
1109
    shiftCount = countLeadingZeros32( absA ) + 32;
1110
    zSig = absA;
1111
    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1112

    
1113
}
1114

    
1115
#endif
1116

    
1117
#ifdef FLOAT128
1118

    
1119
/*----------------------------------------------------------------------------
1120
| Returns the result of converting the 32-bit two's complement integer `a' to
1121
| the quadruple-precision floating-point format.  The conversion is performed
1122
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1123
*----------------------------------------------------------------------------*/
1124

    
1125
float128 int32_to_float128( int32 a STATUS_PARAM )
1126
{
1127
    flag zSign;
1128
    uint32 absA;
1129
    int8 shiftCount;
1130
    bits64 zSig0;
1131

    
1132
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1133
    zSign = ( a < 0 );
1134
    absA = zSign ? - a : a;
1135
    shiftCount = countLeadingZeros32( absA ) + 17;
1136
    zSig0 = absA;
1137
    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1138

    
1139
}
1140

    
1141
#endif
1142

    
1143
/*----------------------------------------------------------------------------
1144
| Returns the result of converting the 64-bit two's complement integer `a'
1145
| to the single-precision floating-point format.  The conversion is performed
1146
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1147
*----------------------------------------------------------------------------*/
1148

    
1149
float32 int64_to_float32( int64 a STATUS_PARAM )
1150
{
1151
    flag zSign;
1152
    uint64 absA;
1153
    int8 shiftCount;
1154

    
1155
    if ( a == 0 ) return float32_zero;
1156
    zSign = ( a < 0 );
1157
    absA = zSign ? - a : a;
1158
    shiftCount = countLeadingZeros64( absA ) - 40;
1159
    if ( 0 <= shiftCount ) {
1160
        return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1161
    }
1162
    else {
1163
        shiftCount += 7;
1164
        if ( shiftCount < 0 ) {
1165
            shift64RightJamming( absA, - shiftCount, &absA );
1166
        }
1167
        else {
1168
            absA <<= shiftCount;
1169
        }
1170
        return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1171
    }
1172

    
1173
}
1174

    
1175
float32 uint64_to_float32( uint64 a STATUS_PARAM )
1176
{
1177
    int8 shiftCount;
1178

    
1179
    if ( a == 0 ) return float32_zero;
1180
    shiftCount = countLeadingZeros64( a ) - 40;
1181
    if ( 0 <= shiftCount ) {
1182
        return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1183
    }
1184
    else {
1185
        shiftCount += 7;
1186
        if ( shiftCount < 0 ) {
1187
            shift64RightJamming( a, - shiftCount, &a );
1188
        }
1189
        else {
1190
            a <<= shiftCount;
1191
        }
1192
        return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1193
    }
1194
}
1195

    
1196
/*----------------------------------------------------------------------------
1197
| Returns the result of converting the 64-bit two's complement integer `a'
1198
| to the double-precision floating-point format.  The conversion is performed
1199
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1200
*----------------------------------------------------------------------------*/
1201

    
1202
float64 int64_to_float64( int64 a STATUS_PARAM )
1203
{
1204
    flag zSign;
1205

    
1206
    if ( a == 0 ) return float64_zero;
1207
    if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1208
        return packFloat64( 1, 0x43E, 0 );
1209
    }
1210
    zSign = ( a < 0 );
1211
    return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1212

    
1213
}
1214

    
1215
float64 uint64_to_float64( uint64 a STATUS_PARAM )
1216
{
1217
    if ( a == 0 ) return float64_zero;
1218
    return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1219

    
1220
}
1221

    
1222
#ifdef FLOATX80
1223

    
1224
/*----------------------------------------------------------------------------
1225
| Returns the result of converting the 64-bit two's complement integer `a'
1226
| to the extended double-precision floating-point format.  The conversion
1227
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1228
| Arithmetic.
1229
*----------------------------------------------------------------------------*/
1230

    
1231
floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1232
{
1233
    flag zSign;
1234
    uint64 absA;
1235
    int8 shiftCount;
1236

    
1237
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1238
    zSign = ( a < 0 );
1239
    absA = zSign ? - a : a;
1240
    shiftCount = countLeadingZeros64( absA );
1241
    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1242

    
1243
}
1244

    
1245
#endif
1246

    
1247
#ifdef FLOAT128
1248

    
1249
/*----------------------------------------------------------------------------
1250
| Returns the result of converting the 64-bit two's complement integer `a' to
1251
| the quadruple-precision floating-point format.  The conversion is performed
1252
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1253
*----------------------------------------------------------------------------*/
1254

    
1255
float128 int64_to_float128( int64 a STATUS_PARAM )
1256
{
1257
    flag zSign;
1258
    uint64 absA;
1259
    int8 shiftCount;
1260
    int32 zExp;
1261
    bits64 zSig0, zSig1;
1262

    
1263
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1264
    zSign = ( a < 0 );
1265
    absA = zSign ? - a : a;
1266
    shiftCount = countLeadingZeros64( absA ) + 49;
1267
    zExp = 0x406E - shiftCount;
1268
    if ( 64 <= shiftCount ) {
1269
        zSig1 = 0;
1270
        zSig0 = absA;
1271
        shiftCount -= 64;
1272
    }
1273
    else {
1274
        zSig1 = absA;
1275
        zSig0 = 0;
1276
    }
1277
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1278
    return packFloat128( zSign, zExp, zSig0, zSig1 );
1279

    
1280
}
1281

    
1282
#endif
1283

    
1284
/*----------------------------------------------------------------------------
1285
| Returns the result of converting the single-precision floating-point value
1286
| `a' to the 32-bit two's complement integer format.  The conversion is
1287
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1288
| Arithmetic---which means in particular that the conversion is rounded
1289
| according to the current rounding mode.  If `a' is a NaN, the largest
1290
| positive integer is returned.  Otherwise, if the conversion overflows, the
1291
| largest integer with the same sign as `a' is returned.
1292
*----------------------------------------------------------------------------*/
1293

    
1294
int32 float32_to_int32( float32 a STATUS_PARAM )
1295
{
1296
    flag aSign;
1297
    int16 aExp, shiftCount;
1298
    bits32 aSig;
1299
    bits64 aSig64;
1300

    
1301
    aSig = extractFloat32Frac( a );
1302
    aExp = extractFloat32Exp( a );
1303
    aSign = extractFloat32Sign( a );
1304
    if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1305
    if ( aExp ) aSig |= 0x00800000;
1306
    shiftCount = 0xAF - aExp;
1307
    aSig64 = aSig;
1308
    aSig64 <<= 32;
1309
    if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1310
    return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1311

    
1312
}
1313

    
1314
/*----------------------------------------------------------------------------
1315
| Returns the result of converting the single-precision floating-point value
1316
| `a' to the 32-bit two's complement integer format.  The conversion is
1317
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1318
| Arithmetic, except that the conversion is always rounded toward zero.
1319
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1320
| the conversion overflows, the largest integer with the same sign as `a' is
1321
| returned.
1322
*----------------------------------------------------------------------------*/
1323

    
1324
int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1325
{
1326
    flag aSign;
1327
    int16 aExp, shiftCount;
1328
    bits32 aSig;
1329
    int32 z;
1330

    
1331
    aSig = extractFloat32Frac( a );
1332
    aExp = extractFloat32Exp( a );
1333
    aSign = extractFloat32Sign( a );
1334
    shiftCount = aExp - 0x9E;
1335
    if ( 0 <= shiftCount ) {
1336
        if ( float32_val(a) != 0xCF000000 ) {
1337
            float_raise( float_flag_invalid STATUS_VAR);
1338
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1339
        }
1340
        return (sbits32) 0x80000000;
1341
    }
1342
    else if ( aExp <= 0x7E ) {
1343
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1344
        return 0;
1345
    }
1346
    aSig = ( aSig | 0x00800000 )<<8;
1347
    z = aSig>>( - shiftCount );
1348
    if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1349
        STATUS(float_exception_flags) |= float_flag_inexact;
1350
    }
1351
    if ( aSign ) z = - z;
1352
    return z;
1353

    
1354
}
1355

    
1356
/*----------------------------------------------------------------------------
1357
| Returns the result of converting the single-precision floating-point value
1358
| `a' to the 64-bit two's complement integer format.  The conversion is
1359
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1360
| Arithmetic---which means in particular that the conversion is rounded
1361
| according to the current rounding mode.  If `a' is a NaN, the largest
1362
| positive integer is returned.  Otherwise, if the conversion overflows, the
1363
| largest integer with the same sign as `a' is returned.
1364
*----------------------------------------------------------------------------*/
1365

    
1366
int64 float32_to_int64( float32 a STATUS_PARAM )
1367
{
1368
    flag aSign;
1369
    int16 aExp, shiftCount;
1370
    bits32 aSig;
1371
    bits64 aSig64, aSigExtra;
1372

    
1373
    aSig = extractFloat32Frac( a );
1374
    aExp = extractFloat32Exp( a );
1375
    aSign = extractFloat32Sign( a );
1376
    shiftCount = 0xBE - aExp;
1377
    if ( shiftCount < 0 ) {
1378
        float_raise( float_flag_invalid STATUS_VAR);
1379
        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1380
            return LIT64( 0x7FFFFFFFFFFFFFFF );
1381
        }
1382
        return (sbits64) LIT64( 0x8000000000000000 );
1383
    }
1384
    if ( aExp ) aSig |= 0x00800000;
1385
    aSig64 = aSig;
1386
    aSig64 <<= 40;
1387
    shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1388
    return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1389

    
1390
}
1391

    
1392
/*----------------------------------------------------------------------------
1393
| Returns the result of converting the single-precision floating-point value
1394
| `a' to the 64-bit two's complement integer format.  The conversion is
1395
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1396
| Arithmetic, except that the conversion is always rounded toward zero.  If
1397
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1398
| conversion overflows, the largest integer with the same sign as `a' is
1399
| returned.
1400
*----------------------------------------------------------------------------*/
1401

    
1402
int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1403
{
1404
    flag aSign;
1405
    int16 aExp, shiftCount;
1406
    bits32 aSig;
1407
    bits64 aSig64;
1408
    int64 z;
1409

    
1410
    aSig = extractFloat32Frac( a );
1411
    aExp = extractFloat32Exp( a );
1412
    aSign = extractFloat32Sign( a );
1413
    shiftCount = aExp - 0xBE;
1414
    if ( 0 <= shiftCount ) {
1415
        if ( float32_val(a) != 0xDF000000 ) {
1416
            float_raise( float_flag_invalid STATUS_VAR);
1417
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1418
                return LIT64( 0x7FFFFFFFFFFFFFFF );
1419
            }
1420
        }
1421
        return (sbits64) LIT64( 0x8000000000000000 );
1422
    }
1423
    else if ( aExp <= 0x7E ) {
1424
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1425
        return 0;
1426
    }
1427
    aSig64 = aSig | 0x00800000;
1428
    aSig64 <<= 40;
1429
    z = aSig64>>( - shiftCount );
1430
    if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1431
        STATUS(float_exception_flags) |= float_flag_inexact;
1432
    }
1433
    if ( aSign ) z = - z;
1434
    return z;
1435

    
1436
}
1437

    
1438
/*----------------------------------------------------------------------------
1439
| Returns the result of converting the single-precision floating-point value
1440
| `a' to the double-precision floating-point format.  The conversion is
1441
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1442
| Arithmetic.
1443
*----------------------------------------------------------------------------*/
1444

    
1445
float64 float32_to_float64( float32 a STATUS_PARAM )
1446
{
1447
    flag aSign;
1448
    int16 aExp;
1449
    bits32 aSig;
1450

    
1451
    aSig = extractFloat32Frac( a );
1452
    aExp = extractFloat32Exp( a );
1453
    aSign = extractFloat32Sign( a );
1454
    if ( aExp == 0xFF ) {
1455
        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1456
        return packFloat64( aSign, 0x7FF, 0 );
1457
    }
1458
    if ( aExp == 0 ) {
1459
        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1460
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1461
        --aExp;
1462
    }
1463
    return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1464

    
1465
}
1466

    
1467
#ifdef FLOATX80
1468

    
1469
/*----------------------------------------------------------------------------
1470
| Returns the result of converting the single-precision floating-point value
1471
| `a' to the extended double-precision floating-point format.  The conversion
1472
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1473
| Arithmetic.
1474
*----------------------------------------------------------------------------*/
1475

    
1476
floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1477
{
1478
    flag aSign;
1479
    int16 aExp;
1480
    bits32 aSig;
1481

    
1482
    aSig = extractFloat32Frac( a );
1483
    aExp = extractFloat32Exp( a );
1484
    aSign = extractFloat32Sign( a );
1485
    if ( aExp == 0xFF ) {
1486
        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1487
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1488
    }
1489
    if ( aExp == 0 ) {
1490
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1491
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1492
    }
1493
    aSig |= 0x00800000;
1494
    return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1495

    
1496
}
1497

    
1498
#endif
1499

    
1500
#ifdef FLOAT128
1501

    
1502
/*----------------------------------------------------------------------------
1503
| Returns the result of converting the single-precision floating-point value
1504
| `a' to the double-precision floating-point format.  The conversion is
1505
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1506
| Arithmetic.
1507
*----------------------------------------------------------------------------*/
1508

    
1509
float128 float32_to_float128( float32 a STATUS_PARAM )
1510
{
1511
    flag aSign;
1512
    int16 aExp;
1513
    bits32 aSig;
1514

    
1515
    aSig = extractFloat32Frac( a );
1516
    aExp = extractFloat32Exp( a );
1517
    aSign = extractFloat32Sign( a );
1518
    if ( aExp == 0xFF ) {
1519
        if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1520
        return packFloat128( aSign, 0x7FFF, 0, 0 );
1521
    }
1522
    if ( aExp == 0 ) {
1523
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1524
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1525
        --aExp;
1526
    }
1527
    return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1528

    
1529
}
1530

    
1531
#endif
1532

    
1533
/*----------------------------------------------------------------------------
1534
| Rounds the single-precision floating-point value `a' to an integer, and
1535
| returns the result as a single-precision floating-point value.  The
1536
| operation is performed according to the IEC/IEEE Standard for Binary
1537
| Floating-Point Arithmetic.
1538
*----------------------------------------------------------------------------*/
1539

    
1540
float32 float32_round_to_int( float32 a STATUS_PARAM)
1541
{
1542
    flag aSign;
1543
    int16 aExp;
1544
    bits32 lastBitMask, roundBitsMask;
1545
    int8 roundingMode;
1546
    bits32 z;
1547

    
1548
    aExp = extractFloat32Exp( a );
1549
    if ( 0x96 <= aExp ) {
1550
        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1551
            return propagateFloat32NaN( a, a STATUS_VAR );
1552
        }
1553
        return a;
1554
    }
1555
    if ( aExp <= 0x7E ) {
1556
        if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
1557
        STATUS(float_exception_flags) |= float_flag_inexact;
1558
        aSign = extractFloat32Sign( a );
1559
        switch ( STATUS(float_rounding_mode) ) {
1560
         case float_round_nearest_even:
1561
            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1562
                return packFloat32( aSign, 0x7F, 0 );
1563
            }
1564
            break;
1565
         case float_round_down:
1566
            return make_float32(aSign ? 0xBF800000 : 0);
1567
         case float_round_up:
1568
            return make_float32(aSign ? 0x80000000 : 0x3F800000);
1569
        }
1570
        return packFloat32( aSign, 0, 0 );
1571
    }
1572
    lastBitMask = 1;
1573
    lastBitMask <<= 0x96 - aExp;
1574
    roundBitsMask = lastBitMask - 1;
1575
    z = float32_val(a);
1576
    roundingMode = STATUS(float_rounding_mode);
1577
    if ( roundingMode == float_round_nearest_even ) {
1578
        z += lastBitMask>>1;
1579
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1580
    }
1581
    else if ( roundingMode != float_round_to_zero ) {
1582
        if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1583
            z += roundBitsMask;
1584
        }
1585
    }
1586
    z &= ~ roundBitsMask;
1587
    if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1588
    return make_float32(z);
1589

    
1590
}
1591

    
1592
/*----------------------------------------------------------------------------
1593
| Returns the result of adding the absolute values of the single-precision
1594
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1595
| before being returned.  `zSign' is ignored if the result is a NaN.
1596
| The addition is performed according to the IEC/IEEE Standard for Binary
1597
| Floating-Point Arithmetic.
1598
*----------------------------------------------------------------------------*/
1599

    
1600
static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1601
{
1602
    int16 aExp, bExp, zExp;
1603
    bits32 aSig, bSig, zSig;
1604
    int16 expDiff;
1605

    
1606
    aSig = extractFloat32Frac( a );
1607
    aExp = extractFloat32Exp( a );
1608
    bSig = extractFloat32Frac( b );
1609
    bExp = extractFloat32Exp( b );
1610
    expDiff = aExp - bExp;
1611
    aSig <<= 6;
1612
    bSig <<= 6;
1613
    if ( 0 < expDiff ) {
1614
        if ( aExp == 0xFF ) {
1615
            if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1616
            return a;
1617
        }
1618
        if ( bExp == 0 ) {
1619
            --expDiff;
1620
        }
1621
        else {
1622
            bSig |= 0x20000000;
1623
        }
1624
        shift32RightJamming( bSig, expDiff, &bSig );
1625
        zExp = aExp;
1626
    }
1627
    else if ( expDiff < 0 ) {
1628
        if ( bExp == 0xFF ) {
1629
            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1630
            return packFloat32( zSign, 0xFF, 0 );
1631
        }
1632
        if ( aExp == 0 ) {
1633
            ++expDiff;
1634
        }
1635
        else {
1636
            aSig |= 0x20000000;
1637
        }
1638
        shift32RightJamming( aSig, - expDiff, &aSig );
1639
        zExp = bExp;
1640
    }
1641
    else {
1642
        if ( aExp == 0xFF ) {
1643
            if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1644
            return a;
1645
        }
1646
        if ( aExp == 0 ) {
1647
            if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1648
            return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1649
        }
1650
        zSig = 0x40000000 + aSig + bSig;
1651
        zExp = aExp;
1652
        goto roundAndPack;
1653
    }
1654
    aSig |= 0x20000000;
1655
    zSig = ( aSig + bSig )<<1;
1656
    --zExp;
1657
    if ( (sbits32) zSig < 0 ) {
1658
        zSig = aSig + bSig;
1659
        ++zExp;
1660
    }
1661
 roundAndPack:
1662
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1663

    
1664
}
1665

    
1666
/*----------------------------------------------------------------------------
1667
| Returns the result of subtracting the absolute values of the single-
1668
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
1669
| difference is negated before being returned.  `zSign' is ignored if the
1670
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
1671
| Standard for Binary Floating-Point Arithmetic.
1672
*----------------------------------------------------------------------------*/
1673

    
1674
static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1675
{
1676
    int16 aExp, bExp, zExp;
1677
    bits32 aSig, bSig, zSig;
1678
    int16 expDiff;
1679

    
1680
    aSig = extractFloat32Frac( a );
1681
    aExp = extractFloat32Exp( a );
1682
    bSig = extractFloat32Frac( b );
1683
    bExp = extractFloat32Exp( b );
1684
    expDiff = aExp - bExp;
1685
    aSig <<= 7;
1686
    bSig <<= 7;
1687
    if ( 0 < expDiff ) goto aExpBigger;
1688
    if ( expDiff < 0 ) goto bExpBigger;
1689
    if ( aExp == 0xFF ) {
1690
        if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1691
        float_raise( float_flag_invalid STATUS_VAR);
1692
        return float32_default_nan;
1693
    }
1694
    if ( aExp == 0 ) {
1695
        aExp = 1;
1696
        bExp = 1;
1697
    }
1698
    if ( bSig < aSig ) goto aBigger;
1699
    if ( aSig < bSig ) goto bBigger;
1700
    return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1701
 bExpBigger:
1702
    if ( bExp == 0xFF ) {
1703
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1704
        return packFloat32( zSign ^ 1, 0xFF, 0 );
1705
    }
1706
    if ( aExp == 0 ) {
1707
        ++expDiff;
1708
    }
1709
    else {
1710
        aSig |= 0x40000000;
1711
    }
1712
    shift32RightJamming( aSig, - expDiff, &aSig );
1713
    bSig |= 0x40000000;
1714
 bBigger:
1715
    zSig = bSig - aSig;
1716
    zExp = bExp;
1717
    zSign ^= 1;
1718
    goto normalizeRoundAndPack;
1719
 aExpBigger:
1720
    if ( aExp == 0xFF ) {
1721
        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1722
        return a;
1723
    }
1724
    if ( bExp == 0 ) {
1725
        --expDiff;
1726
    }
1727
    else {
1728
        bSig |= 0x40000000;
1729
    }
1730
    shift32RightJamming( bSig, expDiff, &bSig );
1731
    aSig |= 0x40000000;
1732
 aBigger:
1733
    zSig = aSig - bSig;
1734
    zExp = aExp;
1735
 normalizeRoundAndPack:
1736
    --zExp;
1737
    return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1738

    
1739
}
1740

    
1741
/*----------------------------------------------------------------------------
1742
| Returns the result of adding the single-precision floating-point values `a'
1743
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
1744
| Binary Floating-Point Arithmetic.
1745
*----------------------------------------------------------------------------*/
1746

    
1747
float32 float32_add( float32 a, float32 b STATUS_PARAM )
1748
{
1749
    flag aSign, bSign;
1750

    
1751
    aSign = extractFloat32Sign( a );
1752
    bSign = extractFloat32Sign( b );
1753
    if ( aSign == bSign ) {
1754
        return addFloat32Sigs( a, b, aSign STATUS_VAR);
1755
    }
1756
    else {
1757
        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1758
    }
1759

    
1760
}
1761

    
1762
/*----------------------------------------------------------------------------
1763
| Returns the result of subtracting the single-precision floating-point values
1764
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1765
| for Binary Floating-Point Arithmetic.
1766
*----------------------------------------------------------------------------*/
1767

    
1768
float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1769
{
1770
    flag aSign, bSign;
1771

    
1772
    aSign = extractFloat32Sign( a );
1773
    bSign = extractFloat32Sign( b );
1774
    if ( aSign == bSign ) {
1775
        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1776
    }
1777
    else {
1778
        return addFloat32Sigs( a, b, aSign STATUS_VAR );
1779
    }
1780

    
1781
}
1782

    
1783
/*----------------------------------------------------------------------------
1784
| Returns the result of multiplying the single-precision floating-point values
1785
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1786
| for Binary Floating-Point Arithmetic.
1787
*----------------------------------------------------------------------------*/
1788

    
1789
float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1790
{
1791
    flag aSign, bSign, zSign;
1792
    int16 aExp, bExp, zExp;
1793
    bits32 aSig, bSig;
1794
    bits64 zSig64;
1795
    bits32 zSig;
1796

    
1797
    aSig = extractFloat32Frac( a );
1798
    aExp = extractFloat32Exp( a );
1799
    aSign = extractFloat32Sign( a );
1800
    bSig = extractFloat32Frac( b );
1801
    bExp = extractFloat32Exp( b );
1802
    bSign = extractFloat32Sign( b );
1803
    zSign = aSign ^ bSign;
1804
    if ( aExp == 0xFF ) {
1805
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1806
            return propagateFloat32NaN( a, b STATUS_VAR );
1807
        }
1808
        if ( ( bExp | bSig ) == 0 ) {
1809
            float_raise( float_flag_invalid STATUS_VAR);
1810
            return float32_default_nan;
1811
        }
1812
        return packFloat32( zSign, 0xFF, 0 );
1813
    }
1814
    if ( bExp == 0xFF ) {
1815
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1816
        if ( ( aExp | aSig ) == 0 ) {
1817
            float_raise( float_flag_invalid STATUS_VAR);
1818
            return float32_default_nan;
1819
        }
1820
        return packFloat32( zSign, 0xFF, 0 );
1821
    }
1822
    if ( aExp == 0 ) {
1823
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1824
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1825
    }
1826
    if ( bExp == 0 ) {
1827
        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1828
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1829
    }
1830
    zExp = aExp + bExp - 0x7F;
1831
    aSig = ( aSig | 0x00800000 )<<7;
1832
    bSig = ( bSig | 0x00800000 )<<8;
1833
    shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1834
    zSig = zSig64;
1835
    if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1836
        zSig <<= 1;
1837
        --zExp;
1838
    }
1839
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1840

    
1841
}
1842

    
1843
/*----------------------------------------------------------------------------
1844
| Returns the result of dividing the single-precision floating-point value `a'
1845
| by the corresponding value `b'.  The operation is performed according to the
1846
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1847
*----------------------------------------------------------------------------*/
1848

    
1849
float32 float32_div( float32 a, float32 b STATUS_PARAM )
1850
{
1851
    flag aSign, bSign, zSign;
1852
    int16 aExp, bExp, zExp;
1853
    bits32 aSig, bSig, zSig;
1854

    
1855
    aSig = extractFloat32Frac( a );
1856
    aExp = extractFloat32Exp( a );
1857
    aSign = extractFloat32Sign( a );
1858
    bSig = extractFloat32Frac( b );
1859
    bExp = extractFloat32Exp( b );
1860
    bSign = extractFloat32Sign( b );
1861
    zSign = aSign ^ bSign;
1862
    if ( aExp == 0xFF ) {
1863
        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1864
        if ( bExp == 0xFF ) {
1865
            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1866
            float_raise( float_flag_invalid STATUS_VAR);
1867
            return float32_default_nan;
1868
        }
1869
        return packFloat32( zSign, 0xFF, 0 );
1870
    }
1871
    if ( bExp == 0xFF ) {
1872
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1873
        return packFloat32( zSign, 0, 0 );
1874
    }
1875
    if ( bExp == 0 ) {
1876
        if ( bSig == 0 ) {
1877
            if ( ( aExp | aSig ) == 0 ) {
1878
                float_raise( float_flag_invalid STATUS_VAR);
1879
                return float32_default_nan;
1880
            }
1881
            float_raise( float_flag_divbyzero STATUS_VAR);
1882
            return packFloat32( zSign, 0xFF, 0 );
1883
        }
1884
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1885
    }
1886
    if ( aExp == 0 ) {
1887
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1888
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1889
    }
1890
    zExp = aExp - bExp + 0x7D;
1891
    aSig = ( aSig | 0x00800000 )<<7;
1892
    bSig = ( bSig | 0x00800000 )<<8;
1893
    if ( bSig <= ( aSig + aSig ) ) {
1894
        aSig >>= 1;
1895
        ++zExp;
1896
    }
1897
    zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1898
    if ( ( zSig & 0x3F ) == 0 ) {
1899
        zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1900
    }
1901
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1902

    
1903
}
1904

    
1905
/*----------------------------------------------------------------------------
1906
| Returns the remainder of the single-precision floating-point value `a'
1907
| with respect to the corresponding value `b'.  The operation is performed
1908
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1909
*----------------------------------------------------------------------------*/
1910

    
1911
float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1912
{
1913
    flag aSign, zSign;
1914
    int16 aExp, bExp, expDiff;
1915
    bits32 aSig, bSig;
1916
    bits32 q;
1917
    bits64 aSig64, bSig64, q64;
1918
    bits32 alternateASig;
1919
    sbits32 sigMean;
1920

    
1921
    aSig = extractFloat32Frac( a );
1922
    aExp = extractFloat32Exp( a );
1923
    aSign = extractFloat32Sign( a );
1924
    bSig = extractFloat32Frac( b );
1925
    bExp = extractFloat32Exp( b );
1926
    if ( aExp == 0xFF ) {
1927
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1928
            return propagateFloat32NaN( a, b STATUS_VAR );
1929
        }
1930
        float_raise( float_flag_invalid STATUS_VAR);
1931
        return float32_default_nan;
1932
    }
1933
    if ( bExp == 0xFF ) {
1934
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1935
        return a;
1936
    }
1937
    if ( bExp == 0 ) {
1938
        if ( bSig == 0 ) {
1939
            float_raise( float_flag_invalid STATUS_VAR);
1940
            return float32_default_nan;
1941
        }
1942
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1943
    }
1944
    if ( aExp == 0 ) {
1945
        if ( aSig == 0 ) return a;
1946
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1947
    }
1948
    expDiff = aExp - bExp;
1949
    aSig |= 0x00800000;
1950
    bSig |= 0x00800000;
1951
    if ( expDiff < 32 ) {
1952
        aSig <<= 8;
1953
        bSig <<= 8;
1954
        if ( expDiff < 0 ) {
1955
            if ( expDiff < -1 ) return a;
1956
            aSig >>= 1;
1957
        }
1958
        q = ( bSig <= aSig );
1959
        if ( q ) aSig -= bSig;
1960
        if ( 0 < expDiff ) {
1961
            q = ( ( (bits64) aSig )<<32 ) / bSig;
1962
            q >>= 32 - expDiff;
1963
            bSig >>= 2;
1964
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1965
        }
1966
        else {
1967
            aSig >>= 2;
1968
            bSig >>= 2;
1969
        }
1970
    }
1971
    else {
1972
        if ( bSig <= aSig ) aSig -= bSig;
1973
        aSig64 = ( (bits64) aSig )<<40;
1974
        bSig64 = ( (bits64) bSig )<<40;
1975
        expDiff -= 64;
1976
        while ( 0 < expDiff ) {
1977
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1978
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1979
            aSig64 = - ( ( bSig * q64 )<<38 );
1980
            expDiff -= 62;
1981
        }
1982
        expDiff += 64;
1983
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1984
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1985
        q = q64>>( 64 - expDiff );
1986
        bSig <<= 6;
1987
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1988
    }
1989
    do {
1990
        alternateASig = aSig;
1991
        ++q;
1992
        aSig -= bSig;
1993
    } while ( 0 <= (sbits32) aSig );
1994
    sigMean = aSig + alternateASig;
1995
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1996
        aSig = alternateASig;
1997
    }
1998
    zSign = ( (sbits32) aSig < 0 );
1999
    if ( zSign ) aSig = - aSig;
2000
    return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2001

    
2002
}
2003

    
2004
/*----------------------------------------------------------------------------
2005
| Returns the square root of the single-precision floating-point value `a'.
2006
| The operation is performed according to the IEC/IEEE Standard for Binary
2007
| Floating-Point Arithmetic.
2008
*----------------------------------------------------------------------------*/
2009

    
2010
float32 float32_sqrt( float32 a STATUS_PARAM )
2011
{
2012
    flag aSign;
2013
    int16 aExp, zExp;
2014
    bits32 aSig, zSig;
2015
    bits64 rem, term;
2016

    
2017
    aSig = extractFloat32Frac( a );
2018
    aExp = extractFloat32Exp( a );
2019
    aSign = extractFloat32Sign( a );
2020
    if ( aExp == 0xFF ) {
2021
        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2022
        if ( ! aSign ) return a;
2023
        float_raise( float_flag_invalid STATUS_VAR);
2024
        return float32_default_nan;
2025
    }
2026
    if ( aSign ) {
2027
        if ( ( aExp | aSig ) == 0 ) return a;
2028
        float_raise( float_flag_invalid STATUS_VAR);
2029
        return float32_default_nan;
2030
    }
2031
    if ( aExp == 0 ) {
2032
        if ( aSig == 0 ) return float32_zero;
2033
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2034
    }
2035
    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2036
    aSig = ( aSig | 0x00800000 )<<8;
2037
    zSig = estimateSqrt32( aExp, aSig ) + 2;
2038
    if ( ( zSig & 0x7F ) <= 5 ) {
2039
        if ( zSig < 2 ) {
2040
            zSig = 0x7FFFFFFF;
2041
            goto roundAndPack;
2042
        }
2043
        aSig >>= aExp & 1;
2044
        term = ( (bits64) zSig ) * zSig;
2045
        rem = ( ( (bits64) aSig )<<32 ) - term;
2046
        while ( (sbits64) rem < 0 ) {
2047
            --zSig;
2048
            rem += ( ( (bits64) zSig )<<1 ) | 1;
2049
        }
2050
        zSig |= ( rem != 0 );
2051
    }
2052
    shift32RightJamming( zSig, 1, &zSig );
2053
 roundAndPack:
2054
    return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2055

    
2056
}
2057

    
2058
/*----------------------------------------------------------------------------
2059
| Returns the binary log of the single-precision floating-point value `a'.
2060
| The operation is performed according to the IEC/IEEE Standard for Binary
2061
| Floating-Point Arithmetic.
2062
*----------------------------------------------------------------------------*/
2063
float32 float32_log2( float32 a STATUS_PARAM )
2064
{
2065
    flag aSign, zSign;
2066
    int16 aExp;
2067
    bits32 aSig, zSig, i;
2068

    
2069
    aSig = extractFloat32Frac( a );
2070
    aExp = extractFloat32Exp( a );
2071
    aSign = extractFloat32Sign( a );
2072

    
2073
    if ( aExp == 0 ) {
2074
        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2075
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2076
    }
2077
    if ( aSign ) {
2078
        float_raise( float_flag_invalid STATUS_VAR);
2079
        return float32_default_nan;
2080
    }
2081
    if ( aExp == 0xFF ) {
2082
        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2083
        return a;
2084
    }
2085

    
2086
    aExp -= 0x7F;
2087
    aSig |= 0x00800000;
2088
    zSign = aExp < 0;
2089
    zSig = aExp << 23;
2090

    
2091
    for (i = 1 << 22; i > 0; i >>= 1) {
2092
        aSig = ( (bits64)aSig * aSig ) >> 23;
2093
        if ( aSig & 0x01000000 ) {
2094
            aSig >>= 1;
2095
            zSig |= i;
2096
        }
2097
    }
2098

    
2099
    if ( zSign )
2100
        zSig = -zSig;
2101

    
2102
    return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2103
}
2104

    
2105
/*----------------------------------------------------------------------------
2106
| Returns 1 if the single-precision floating-point value `a' is equal to
2107
| the corresponding value `b', and 0 otherwise.  The comparison is performed
2108
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2109
*----------------------------------------------------------------------------*/
2110

    
2111
int float32_eq( float32 a, float32 b STATUS_PARAM )
2112
{
2113

    
2114
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2115
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2116
       ) {
2117
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2118
            float_raise( float_flag_invalid STATUS_VAR);
2119
        }
2120
        return 0;
2121
    }
2122
    return ( float32_val(a) == float32_val(b) ) ||
2123
            ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2124

    
2125
}
2126

    
2127
/*----------------------------------------------------------------------------
2128
| Returns 1 if the single-precision floating-point value `a' is less than
2129
| or equal to the corresponding value `b', and 0 otherwise.  The comparison
2130
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2131
| Arithmetic.
2132
*----------------------------------------------------------------------------*/
2133

    
2134
int float32_le( float32 a, float32 b STATUS_PARAM )
2135
{
2136
    flag aSign, bSign;
2137
    bits32 av, bv;
2138

    
2139
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2140
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2141
       ) {
2142
        float_raise( float_flag_invalid STATUS_VAR);
2143
        return 0;
2144
    }
2145
    aSign = extractFloat32Sign( a );
2146
    bSign = extractFloat32Sign( b );
2147
    av = float32_val(a);
2148
    bv = float32_val(b);
2149
    if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2150
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2151

    
2152
}
2153

    
2154
/*----------------------------------------------------------------------------
2155
| Returns 1 if the single-precision floating-point value `a' is less than
2156
| the corresponding value `b', and 0 otherwise.  The comparison is performed
2157
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2158
*----------------------------------------------------------------------------*/
2159

    
2160
int float32_lt( float32 a, float32 b STATUS_PARAM )
2161
{
2162
    flag aSign, bSign;
2163
    bits32 av, bv;
2164

    
2165
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2166
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2167
       ) {
2168
        float_raise( float_flag_invalid STATUS_VAR);
2169
        return 0;
2170
    }
2171
    aSign = extractFloat32Sign( a );
2172
    bSign = extractFloat32Sign( b );
2173
    av = float32_val(a);
2174
    bv = float32_val(b);
2175
    if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2176
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2177

    
2178
}
2179

    
2180
/*----------------------------------------------------------------------------
2181
| Returns 1 if the single-precision floating-point value `a' is equal to
2182
| the corresponding value `b', and 0 otherwise.  The invalid exception is
2183
| raised if either operand is a NaN.  Otherwise, the comparison is performed
2184
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2185
*----------------------------------------------------------------------------*/
2186

    
2187
int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2188
{
2189
    bits32 av, bv;
2190

    
2191
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2192
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2193
       ) {
2194
        float_raise( float_flag_invalid STATUS_VAR);
2195
        return 0;
2196
    }
2197
    av = float32_val(a);
2198
    bv = float32_val(b);
2199
    return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2200

    
2201
}
2202

    
2203
/*----------------------------------------------------------------------------
2204
| Returns 1 if the single-precision floating-point value `a' is less than or
2205
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2206
| cause an exception.  Otherwise, the comparison is performed according to the
2207
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2208
*----------------------------------------------------------------------------*/
2209

    
2210
int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2211
{
2212
    flag aSign, bSign;
2213
    bits32 av, bv;
2214

    
2215
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2216
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2217
       ) {
2218
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2219
            float_raise( float_flag_invalid STATUS_VAR);
2220
        }
2221
        return 0;
2222
    }
2223
    aSign = extractFloat32Sign( a );
2224
    bSign = extractFloat32Sign( b );
2225
    av = float32_val(a);
2226
    bv = float32_val(b);
2227
    if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2228
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2229

    
2230
}
2231

    
2232
/*----------------------------------------------------------------------------
2233
| Returns 1 if the single-precision floating-point value `a' is less than
2234
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2235
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2236
| Standard for Binary Floating-Point Arithmetic.
2237
*----------------------------------------------------------------------------*/
2238

    
2239
int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2240
{
2241
    flag aSign, bSign;
2242
    bits32 av, bv;
2243

    
2244
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2245
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2246
       ) {
2247
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2248
            float_raise( float_flag_invalid STATUS_VAR);
2249
        }
2250
        return 0;
2251
    }
2252
    aSign = extractFloat32Sign( a );
2253
    bSign = extractFloat32Sign( b );
2254
    av = float32_val(a);
2255
    bv = float32_val(b);
2256
    if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2257
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2258

    
2259
}
2260

    
2261
/*----------------------------------------------------------------------------
2262
| Returns the result of converting the double-precision floating-point value
2263
| `a' to the 32-bit two's complement integer format.  The conversion is
2264
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2265
| Arithmetic---which means in particular that the conversion is rounded
2266
| according to the current rounding mode.  If `a' is a NaN, the largest
2267
| positive integer is returned.  Otherwise, if the conversion overflows, the
2268
| largest integer with the same sign as `a' is returned.
2269
*----------------------------------------------------------------------------*/
2270

    
2271
int32 float64_to_int32( float64 a STATUS_PARAM )
2272
{
2273
    flag aSign;
2274
    int16 aExp, shiftCount;
2275
    bits64 aSig;
2276

    
2277
    aSig = extractFloat64Frac( a );
2278
    aExp = extractFloat64Exp( a );
2279
    aSign = extractFloat64Sign( a );
2280
    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2281
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2282
    shiftCount = 0x42C - aExp;
2283
    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2284
    return roundAndPackInt32( aSign, aSig STATUS_VAR );
2285

    
2286
}
2287

    
2288
/*----------------------------------------------------------------------------
2289
| Returns the result of converting the double-precision floating-point value
2290
| `a' to the 32-bit two's complement integer format.  The conversion is
2291
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2292
| Arithmetic, except that the conversion is always rounded toward zero.
2293
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2294
| the conversion overflows, the largest integer with the same sign as `a' is
2295
| returned.
2296
*----------------------------------------------------------------------------*/
2297

    
2298
int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2299
{
2300
    flag aSign;
2301
    int16 aExp, shiftCount;
2302
    bits64 aSig, savedASig;
2303
    int32 z;
2304

    
2305
    aSig = extractFloat64Frac( a );
2306
    aExp = extractFloat64Exp( a );
2307
    aSign = extractFloat64Sign( a );
2308
    if ( 0x41E < aExp ) {
2309
        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2310
        goto invalid;
2311
    }
2312
    else if ( aExp < 0x3FF ) {
2313
        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2314
        return 0;
2315
    }
2316
    aSig |= LIT64( 0x0010000000000000 );
2317
    shiftCount = 0x433 - aExp;
2318
    savedASig = aSig;
2319
    aSig >>= shiftCount;
2320
    z = aSig;
2321
    if ( aSign ) z = - z;
2322
    if ( ( z < 0 ) ^ aSign ) {
2323
 invalid:
2324
        float_raise( float_flag_invalid STATUS_VAR);
2325
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2326
    }
2327
    if ( ( aSig<<shiftCount ) != savedASig ) {
2328
        STATUS(float_exception_flags) |= float_flag_inexact;
2329
    }
2330
    return z;
2331

    
2332
}
2333

    
2334
/*----------------------------------------------------------------------------
2335
| Returns the result of converting the double-precision floating-point value
2336
| `a' to the 64-bit two's complement integer format.  The conversion is
2337
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2338
| Arithmetic---which means in particular that the conversion is rounded
2339
| according to the current rounding mode.  If `a' is a NaN, the largest
2340
| positive integer is returned.  Otherwise, if the conversion overflows, the
2341
| largest integer with the same sign as `a' is returned.
2342
*----------------------------------------------------------------------------*/
2343

    
2344
int64 float64_to_int64( float64 a STATUS_PARAM )
2345
{
2346
    flag aSign;
2347
    int16 aExp, shiftCount;
2348
    bits64 aSig, aSigExtra;
2349

    
2350
    aSig = extractFloat64Frac( a );
2351
    aExp = extractFloat64Exp( a );
2352
    aSign = extractFloat64Sign( a );
2353
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2354
    shiftCount = 0x433 - aExp;
2355
    if ( shiftCount <= 0 ) {
2356
        if ( 0x43E < aExp ) {
2357
            float_raise( float_flag_invalid STATUS_VAR);
2358
            if (    ! aSign
2359
                 || (    ( aExp == 0x7FF )
2360
                      && ( aSig != LIT64( 0x0010000000000000 ) ) )
2361
               ) {
2362
                return LIT64( 0x7FFFFFFFFFFFFFFF );
2363
            }
2364
            return (sbits64) LIT64( 0x8000000000000000 );
2365
        }
2366
        aSigExtra = 0;
2367
        aSig <<= - shiftCount;
2368
    }
2369
    else {
2370
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2371
    }
2372
    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2373

    
2374
}
2375

    
2376
/*----------------------------------------------------------------------------
2377
| Returns the result of converting the double-precision floating-point value
2378
| `a' to the 64-bit two's complement integer format.  The conversion is
2379
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2380
| Arithmetic, except that the conversion is always rounded toward zero.
2381
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2382
| the conversion overflows, the largest integer with the same sign as `a' is
2383
| returned.
2384
*----------------------------------------------------------------------------*/
2385

    
2386
int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2387
{
2388
    flag aSign;
2389
    int16 aExp, shiftCount;
2390
    bits64 aSig;
2391
    int64 z;
2392

    
2393
    aSig = extractFloat64Frac( a );
2394
    aExp = extractFloat64Exp( a );
2395
    aSign = extractFloat64Sign( a );
2396
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2397
    shiftCount = aExp - 0x433;
2398
    if ( 0 <= shiftCount ) {
2399
        if ( 0x43E <= aExp ) {
2400
            if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2401
                float_raise( float_flag_invalid STATUS_VAR);
2402
                if (    ! aSign
2403
                     || (    ( aExp == 0x7FF )
2404
                          && ( aSig != LIT64( 0x0010000000000000 ) ) )
2405
                   ) {
2406
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
2407
                }
2408
            }
2409
            return (sbits64) LIT64( 0x8000000000000000 );
2410
        }
2411
        z = aSig<<shiftCount;
2412
    }
2413
    else {
2414
        if ( aExp < 0x3FE ) {
2415
            if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2416
            return 0;
2417
        }
2418
        z = aSig>>( - shiftCount );
2419
        if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2420
            STATUS(float_exception_flags) |= float_flag_inexact;
2421
        }
2422
    }
2423
    if ( aSign ) z = - z;
2424
    return z;
2425

    
2426
}
2427

    
2428
/*----------------------------------------------------------------------------
2429
| Returns the result of converting the double-precision floating-point value
2430
| `a' to the single-precision floating-point format.  The conversion is
2431
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2432
| Arithmetic.
2433
*----------------------------------------------------------------------------*/
2434

    
2435
float32 float64_to_float32( float64 a STATUS_PARAM )
2436
{
2437
    flag aSign;
2438
    int16 aExp;
2439
    bits64 aSig;
2440
    bits32 zSig;
2441

    
2442
    aSig = extractFloat64Frac( a );
2443
    aExp = extractFloat64Exp( a );
2444
    aSign = extractFloat64Sign( a );
2445
    if ( aExp == 0x7FF ) {
2446
        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2447
        return packFloat32( aSign, 0xFF, 0 );
2448
    }
2449
    shift64RightJamming( aSig, 22, &aSig );
2450
    zSig = aSig;
2451
    if ( aExp || zSig ) {
2452
        zSig |= 0x40000000;
2453
        aExp -= 0x381;
2454
    }
2455
    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2456

    
2457
}
2458

    
2459

    
2460
/*----------------------------------------------------------------------------
2461
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2462
| half-precision floating-point value, returning the result.  After being
2463
| shifted into the proper positions, the three fields are simply added
2464
| together to form the result.  This means that any integer portion of `zSig'
2465
| will be added into the exponent.  Since a properly normalized significand
2466
| will have an integer portion equal to 1, the `zExp' input should be 1 less
2467
| than the desired result exponent whenever `zSig' is a complete, normalized
2468
| significand.
2469
*----------------------------------------------------------------------------*/
2470
static bits16 packFloat16(flag zSign, int16 zExp, bits16 zSig)
2471
{
2472
    return (((bits32)zSign) << 15) + (((bits32)zExp) << 10) + zSig;
2473
}
2474

    
2475
/* Half precision floats come in two formats: standard IEEE and "ARM" format.
2476
   The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
2477
  
2478
float32 float16_to_float32( bits16 a, flag ieee STATUS_PARAM )
2479
{
2480
    flag aSign;
2481
    int16 aExp;
2482
    bits32 aSig;
2483

    
2484
    aSign = a >> 15;
2485
    aExp = (a >> 10) & 0x1f;
2486
    aSig = a & 0x3ff;
2487

    
2488
    if (aExp == 0x1f && ieee) {
2489
        if (aSig) {
2490
            /* Make sure correct exceptions are raised.  */
2491
            float32ToCommonNaN(a STATUS_VAR);
2492
            aSig |= 0x200;
2493
        }
2494
        return packFloat32(aSign, 0xff, aSig << 13);
2495
    }
2496
    if (aExp == 0) {
2497
        int8 shiftCount;
2498

    
2499
        if (aSig == 0) {
2500
            return packFloat32(aSign, 0, 0);
2501
        }
2502

    
2503
        shiftCount = countLeadingZeros32( aSig ) - 21;
2504
        aSig = aSig << shiftCount;
2505
        aExp = -shiftCount;
2506
    }
2507
    return packFloat32( aSign, aExp + 0x70, aSig << 13);
2508
}
2509

    
2510
bits16 float32_to_float16( float32 a, flag ieee STATUS_PARAM)
2511
{
2512
    flag aSign;
2513
    int16 aExp;
2514
    bits32 aSig;
2515
    bits32 mask;
2516
    bits32 increment;
2517
    int8 roundingMode;
2518

    
2519
    aSig = extractFloat32Frac( a );
2520
    aExp = extractFloat32Exp( a );
2521
    aSign = extractFloat32Sign( a );
2522
    if ( aExp == 0xFF ) {
2523
        if (aSig) {
2524
            /* Make sure correct exceptions are raised.  */
2525
            float32ToCommonNaN(a STATUS_VAR);
2526
            aSig |= 0x00400000;
2527
        }
2528
        return packFloat16(aSign, 0x1f, aSig >> 13);
2529
    }
2530
    if (aExp == 0 && aSign == 0) {
2531
        return packFloat16(aSign, 0, 0);
2532
    }
2533
    /* Decimal point between bits 22 and 23.  */
2534
    aSig |= 0x00800000;
2535
    aExp -= 0x7f;
2536
    if (aExp < -14) {
2537
        mask = 0x007fffff;
2538
        if (aExp < -24) {
2539
            aExp = -25;
2540
        } else {
2541
            mask >>= 24 + aExp;
2542
        }
2543
    } else {
2544
        mask = 0x00001fff;
2545
    }
2546
    if (aSig & mask) {
2547
        float_raise( float_flag_underflow STATUS_VAR );
2548
        roundingMode = STATUS(float_rounding_mode);
2549
        switch (roundingMode) {
2550
        case float_round_nearest_even:
2551
            increment = (mask + 1) >> 1;
2552
            if ((aSig & mask) == increment) {
2553
                increment = aSig & (increment << 1);
2554
            }
2555
            break;
2556
        case float_round_up:
2557
            increment = aSign ? 0 : mask;
2558
            break;
2559
        case float_round_down:
2560
            increment = aSign ? mask : 0;
2561
            break;
2562
        default: /* round_to_zero */
2563
            increment = 0;
2564
            break;
2565
        }
2566
        aSig += increment;
2567
        if (aSig >= 0x01000000) {
2568
            aSig >>= 1;
2569
            aExp++;
2570
        }
2571
    } else if (aExp < -14
2572
          && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2573
        float_raise( float_flag_underflow STATUS_VAR);
2574
    }
2575

    
2576
    if (ieee) {
2577
        if (aExp > 15) {
2578
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2579
            return packFloat16(aSign, 0x1f, 0);
2580
        }
2581
    } else {
2582
        if (aExp > 16) {
2583
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2584
            return packFloat16(aSign, 0x1f, 0x3ff);
2585
        }
2586
    }
2587
    if (aExp < -24) {
2588
        return packFloat16(aSign, 0, 0);
2589
    }
2590
    if (aExp < -14) {
2591
        aSig >>= -14 - aExp;
2592
        aExp = -14;
2593
    }
2594
    return packFloat16(aSign, aExp + 14, aSig >> 13);
2595
}
2596

    
2597
#ifdef FLOATX80
2598

    
2599
/*----------------------------------------------------------------------------
2600
| Returns the result of converting the double-precision floating-point value
2601
| `a' to the extended double-precision floating-point format.  The conversion
2602
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2603
| Arithmetic.
2604
*----------------------------------------------------------------------------*/
2605

    
2606
floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2607
{
2608
    flag aSign;
2609
    int16 aExp;
2610
    bits64 aSig;
2611

    
2612
    aSig = extractFloat64Frac( a );
2613
    aExp = extractFloat64Exp( a );
2614
    aSign = extractFloat64Sign( a );
2615
    if ( aExp == 0x7FF ) {
2616
        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2617
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2618
    }
2619
    if ( aExp == 0 ) {
2620
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2621
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2622
    }
2623
    return
2624
        packFloatx80(
2625
            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2626

    
2627
}
2628

    
2629
#endif
2630

    
2631
#ifdef FLOAT128
2632

    
2633
/*----------------------------------------------------------------------------
2634
| Returns the result of converting the double-precision floating-point value
2635
| `a' to the quadruple-precision floating-point format.  The conversion is
2636
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2637
| Arithmetic.
2638
*----------------------------------------------------------------------------*/
2639

    
2640
float128 float64_to_float128( float64 a STATUS_PARAM )
2641
{
2642
    flag aSign;
2643
    int16 aExp;
2644
    bits64 aSig, zSig0, zSig1;
2645

    
2646
    aSig = extractFloat64Frac( a );
2647
    aExp = extractFloat64Exp( a );
2648
    aSign = extractFloat64Sign( a );
2649
    if ( aExp == 0x7FF ) {
2650
        if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2651
        return packFloat128( aSign, 0x7FFF, 0, 0 );
2652
    }
2653
    if ( aExp == 0 ) {
2654
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2655
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2656
        --aExp;
2657
    }
2658
    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2659
    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2660

    
2661
}
2662

    
2663
#endif
2664

    
2665
/*----------------------------------------------------------------------------
2666
| Rounds the double-precision floating-point value `a' to an integer, and
2667
| returns the result as a double-precision floating-point value.  The
2668
| operation is performed according to the IEC/IEEE Standard for Binary
2669
| Floating-Point Arithmetic.
2670
*----------------------------------------------------------------------------*/
2671

    
2672
float64 float64_round_to_int( float64 a STATUS_PARAM )
2673
{
2674
    flag aSign;
2675
    int16 aExp;
2676
    bits64 lastBitMask, roundBitsMask;
2677
    int8 roundingMode;
2678
    bits64 z;
2679

    
2680
    aExp = extractFloat64Exp( a );
2681
    if ( 0x433 <= aExp ) {
2682
        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2683
            return propagateFloat64NaN( a, a STATUS_VAR );
2684
        }
2685
        return a;
2686
    }
2687
    if ( aExp < 0x3FF ) {
2688
        if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
2689
        STATUS(float_exception_flags) |= float_flag_inexact;
2690
        aSign = extractFloat64Sign( a );
2691
        switch ( STATUS(float_rounding_mode) ) {
2692
         case float_round_nearest_even:
2693
            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2694
                return packFloat64( aSign, 0x3FF, 0 );
2695
            }
2696
            break;
2697
         case float_round_down:
2698
            return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2699
         case float_round_up:
2700
            return make_float64(
2701
            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2702
        }
2703
        return packFloat64( aSign, 0, 0 );
2704
    }
2705
    lastBitMask = 1;
2706
    lastBitMask <<= 0x433 - aExp;
2707
    roundBitsMask = lastBitMask - 1;
2708
    z = float64_val(a);
2709
    roundingMode = STATUS(float_rounding_mode);
2710
    if ( roundingMode == float_round_nearest_even ) {
2711
        z += lastBitMask>>1;
2712
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2713
    }
2714
    else if ( roundingMode != float_round_to_zero ) {
2715
        if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
2716
            z += roundBitsMask;
2717
        }
2718
    }
2719
    z &= ~ roundBitsMask;
2720
    if ( z != float64_val(a) )
2721
        STATUS(float_exception_flags) |= float_flag_inexact;
2722
    return make_float64(z);
2723

    
2724
}
2725

    
2726
float64 float64_trunc_to_int( float64 a STATUS_PARAM)
2727
{
2728
    int oldmode;
2729
    float64 res;
2730
    oldmode = STATUS(float_rounding_mode);
2731
    STATUS(float_rounding_mode) = float_round_to_zero;
2732
    res = float64_round_to_int(a STATUS_VAR);
2733
    STATUS(float_rounding_mode) = oldmode;
2734
    return res;
2735
}
2736

    
2737
/*----------------------------------------------------------------------------
2738
| Returns the result of adding the absolute values of the double-precision
2739
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2740
| before being returned.  `zSign' is ignored if the result is a NaN.
2741
| The addition is performed according to the IEC/IEEE Standard for Binary
2742
| Floating-Point Arithmetic.
2743
*----------------------------------------------------------------------------*/
2744

    
2745
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2746
{
2747
    int16 aExp, bExp, zExp;
2748
    bits64 aSig, bSig, zSig;
2749
    int16 expDiff;
2750

    
2751
    aSig = extractFloat64Frac( a );
2752
    aExp = extractFloat64Exp( a );
2753
    bSig = extractFloat64Frac( b );
2754
    bExp = extractFloat64Exp( b );
2755
    expDiff = aExp - bExp;
2756
    aSig <<= 9;
2757
    bSig <<= 9;
2758
    if ( 0 < expDiff ) {
2759
        if ( aExp == 0x7FF ) {
2760
            if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2761
            return a;
2762
        }
2763
        if ( bExp == 0 ) {
2764
            --expDiff;
2765
        }
2766
        else {
2767
            bSig |= LIT64( 0x2000000000000000 );
2768
        }
2769
        shift64RightJamming( bSig, expDiff, &bSig );
2770
        zExp = aExp;
2771
    }
2772
    else if ( expDiff < 0 ) {
2773
        if ( bExp == 0x7FF ) {
2774
            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2775
            return packFloat64( zSign, 0x7FF, 0 );
2776
        }
2777
        if ( aExp == 0 ) {
2778
            ++expDiff;
2779
        }
2780
        else {
2781
            aSig |= LIT64( 0x2000000000000000 );
2782
        }
2783
        shift64RightJamming( aSig, - expDiff, &aSig );
2784
        zExp = bExp;
2785
    }
2786
    else {
2787
        if ( aExp == 0x7FF ) {
2788
            if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2789
            return a;
2790
        }
2791
        if ( aExp == 0 ) {
2792
            if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
2793
            return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2794
        }
2795
        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2796
        zExp = aExp;
2797
        goto roundAndPack;
2798
    }
2799
    aSig |= LIT64( 0x2000000000000000 );
2800
    zSig = ( aSig + bSig )<<1;
2801
    --zExp;
2802
    if ( (sbits64) zSig < 0 ) {
2803
        zSig = aSig + bSig;
2804
        ++zExp;
2805
    }
2806
 roundAndPack:
2807
    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2808

    
2809
}
2810

    
2811
/*----------------------------------------------------------------------------
2812
| Returns the result of subtracting the absolute values of the double-
2813
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
2814
| difference is negated before being returned.  `zSign' is ignored if the
2815
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
2816
| Standard for Binary Floating-Point Arithmetic.
2817
*----------------------------------------------------------------------------*/
2818

    
2819
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2820
{
2821
    int16 aExp, bExp, zExp;
2822
    bits64 aSig, bSig, zSig;
2823
    int16 expDiff;
2824

    
2825
    aSig = extractFloat64Frac( a );
2826
    aExp = extractFloat64Exp( a );
2827
    bSig = extractFloat64Frac( b );
2828
    bExp = extractFloat64Exp( b );
2829
    expDiff = aExp - bExp;
2830
    aSig <<= 10;
2831
    bSig <<= 10;
2832
    if ( 0 < expDiff ) goto aExpBigger;
2833
    if ( expDiff < 0 ) goto bExpBigger;
2834
    if ( aExp == 0x7FF ) {
2835
        if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2836
        float_raise( float_flag_invalid STATUS_VAR);
2837
        return float64_default_nan;
2838
    }
2839
    if ( aExp == 0 ) {
2840
        aExp = 1;
2841
        bExp = 1;
2842
    }
2843
    if ( bSig < aSig ) goto aBigger;
2844
    if ( aSig < bSig ) goto bBigger;
2845
    return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2846
 bExpBigger:
2847
    if ( bExp == 0x7FF ) {
2848
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2849
        return packFloat64( zSign ^ 1, 0x7FF, 0 );
2850
    }
2851
    if ( aExp == 0 ) {
2852
        ++expDiff;
2853
    }
2854
    else {
2855
        aSig |= LIT64( 0x4000000000000000 );
2856
    }
2857
    shift64RightJamming( aSig, - expDiff, &aSig );
2858
    bSig |= LIT64( 0x4000000000000000 );
2859
 bBigger:
2860
    zSig = bSig - aSig;
2861
    zExp = bExp;
2862
    zSign ^= 1;
2863
    goto normalizeRoundAndPack;
2864
 aExpBigger:
2865
    if ( aExp == 0x7FF ) {
2866
        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2867
        return a;
2868
    }
2869
    if ( bExp == 0 ) {
2870
        --expDiff;
2871
    }
2872
    else {
2873
        bSig |= LIT64( 0x4000000000000000 );
2874
    }
2875
    shift64RightJamming( bSig, expDiff, &bSig );
2876
    aSig |= LIT64( 0x4000000000000000 );
2877
 aBigger:
2878
    zSig = aSig - bSig;
2879
    zExp = aExp;
2880
 normalizeRoundAndPack:
2881
    --zExp;
2882
    return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2883

    
2884
}
2885

    
2886
/*----------------------------------------------------------------------------
2887
| Returns the result of adding the double-precision floating-point values `a'
2888
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
2889
| Binary Floating-Point Arithmetic.
2890
*----------------------------------------------------------------------------*/
2891

    
2892
float64 float64_add( float64 a, float64 b STATUS_PARAM )
2893
{
2894
    flag aSign, bSign;
2895

    
2896
    aSign = extractFloat64Sign( a );
2897
    bSign = extractFloat64Sign( b );
2898
    if ( aSign == bSign ) {
2899
        return addFloat64Sigs( a, b, aSign STATUS_VAR );
2900
    }
2901
    else {
2902
        return subFloat64Sigs( a, b, aSign STATUS_VAR );
2903
    }
2904

    
2905
}
2906

    
2907
/*----------------------------------------------------------------------------
2908
| Returns the result of subtracting the double-precision floating-point values
2909
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2910
| for Binary Floating-Point Arithmetic.
2911
*----------------------------------------------------------------------------*/
2912

    
2913
float64 float64_sub( float64 a, float64 b STATUS_PARAM )
2914
{
2915
    flag aSign, bSign;
2916

    
2917
    aSign = extractFloat64Sign( a );
2918
    bSign = extractFloat64Sign( b );
2919
    if ( aSign == bSign ) {
2920
        return subFloat64Sigs( a, b, aSign STATUS_VAR );
2921
    }
2922
    else {
2923
        return addFloat64Sigs( a, b, aSign STATUS_VAR );
2924
    }
2925

    
2926
}
2927

    
2928
/*----------------------------------------------------------------------------
2929
| Returns the result of multiplying the double-precision floating-point values
2930
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2931
| for Binary Floating-Point Arithmetic.
2932
*----------------------------------------------------------------------------*/
2933

    
2934
float64 float64_mul( float64 a, float64 b STATUS_PARAM )
2935
{
2936
    flag aSign, bSign, zSign;
2937
    int16 aExp, bExp, zExp;
2938
    bits64 aSig, bSig, zSig0, zSig1;
2939

    
2940
    aSig = extractFloat64Frac( a );
2941
    aExp = extractFloat64Exp( a );
2942
    aSign = extractFloat64Sign( a );
2943
    bSig = extractFloat64Frac( b );
2944
    bExp = extractFloat64Exp( b );
2945
    bSign = extractFloat64Sign( b );
2946
    zSign = aSign ^ bSign;
2947
    if ( aExp == 0x7FF ) {
2948
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2949
            return propagateFloat64NaN( a, b STATUS_VAR );
2950
        }
2951
        if ( ( bExp | bSig ) == 0 ) {
2952
            float_raise( float_flag_invalid STATUS_VAR);
2953
            return float64_default_nan;
2954
        }
2955
        return packFloat64( zSign, 0x7FF, 0 );
2956
    }
2957
    if ( bExp == 0x7FF ) {
2958
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2959
        if ( ( aExp | aSig ) == 0 ) {
2960
            float_raise( float_flag_invalid STATUS_VAR);
2961
            return float64_default_nan;
2962
        }
2963
        return packFloat64( zSign, 0x7FF, 0 );
2964
    }
2965
    if ( aExp == 0 ) {
2966
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2967
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2968
    }
2969
    if ( bExp == 0 ) {
2970
        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2971
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2972
    }
2973
    zExp = aExp + bExp - 0x3FF;
2974
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2975
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2976
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
2977
    zSig0 |= ( zSig1 != 0 );
2978
    if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2979
        zSig0 <<= 1;
2980
        --zExp;
2981
    }
2982
    return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
2983

    
2984
}
2985

    
2986
/*----------------------------------------------------------------------------
2987
| Returns the result of dividing the double-precision floating-point value `a'
2988
| by the corresponding value `b'.  The operation is performed according to
2989
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2990
*----------------------------------------------------------------------------*/
2991

    
2992
float64 float64_div( float64 a, float64 b STATUS_PARAM )
2993
{
2994
    flag aSign, bSign, zSign;
2995
    int16 aExp, bExp, zExp;
2996
    bits64 aSig, bSig, zSig;
2997
    bits64 rem0, rem1;
2998
    bits64 term0, term1;
2999

    
3000
    aSig = extractFloat64Frac( a );
3001
    aExp = extractFloat64Exp( a );
3002
    aSign = extractFloat64Sign( a );
3003
    bSig = extractFloat64Frac( b );
3004
    bExp = extractFloat64Exp( b );
3005
    bSign = extractFloat64Sign( b );
3006
    zSign = aSign ^ bSign;
3007
    if ( aExp == 0x7FF ) {
3008
        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3009
        if ( bExp == 0x7FF ) {
3010
            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3011
            float_raise( float_flag_invalid STATUS_VAR);
3012
            return float64_default_nan;
3013
        }
3014
        return packFloat64( zSign, 0x7FF, 0 );
3015
    }
3016
    if ( bExp == 0x7FF ) {
3017
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3018
        return packFloat64( zSign, 0, 0 );
3019
    }
3020
    if ( bExp == 0 ) {
3021
        if ( bSig == 0 ) {
3022
            if ( ( aExp | aSig ) == 0 ) {
3023
                float_raise( float_flag_invalid STATUS_VAR);
3024
                return float64_default_nan;
3025
            }
3026
            float_raise( float_flag_divbyzero STATUS_VAR);
3027
            return packFloat64( zSign, 0x7FF, 0 );
3028
        }
3029
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3030
    }
3031
    if ( aExp == 0 ) {
3032
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3033
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3034
    }
3035
    zExp = aExp - bExp + 0x3FD;
3036
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3037
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3038
    if ( bSig <= ( aSig + aSig ) ) {
3039
        aSig >>= 1;
3040
        ++zExp;
3041
    }
3042
    zSig = estimateDiv128To64( aSig, 0, bSig );
3043
    if ( ( zSig & 0x1FF ) <= 2 ) {
3044
        mul64To128( bSig, zSig, &term0, &term1 );
3045
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3046
        while ( (sbits64) rem0 < 0 ) {
3047
            --zSig;
3048
            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3049
        }
3050
        zSig |= ( rem1 != 0 );
3051
    }
3052
    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3053

    
3054
}
3055

    
3056
/*----------------------------------------------------------------------------
3057
| Returns the remainder of the double-precision floating-point value `a'
3058
| with respect to the corresponding value `b'.  The operation is performed
3059
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3060
*----------------------------------------------------------------------------*/
3061

    
3062
float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3063
{
3064
    flag aSign, zSign;
3065
    int16 aExp, bExp, expDiff;
3066
    bits64 aSig, bSig;
3067
    bits64 q, alternateASig;
3068
    sbits64 sigMean;
3069

    
3070
    aSig = extractFloat64Frac( a );
3071
    aExp = extractFloat64Exp( a );
3072
    aSign = extractFloat64Sign( a );
3073
    bSig = extractFloat64Frac( b );
3074
    bExp = extractFloat64Exp( b );
3075
    if ( aExp == 0x7FF ) {
3076
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3077
            return propagateFloat64NaN( a, b STATUS_VAR );
3078
        }
3079
        float_raise( float_flag_invalid STATUS_VAR);
3080
        return float64_default_nan;
3081
    }
3082
    if ( bExp == 0x7FF ) {
3083
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3084
        return a;
3085
    }
3086
    if ( bExp == 0 ) {
3087
        if ( bSig == 0 ) {
3088
            float_raise( float_flag_invalid STATUS_VAR);
3089
            return float64_default_nan;
3090
        }
3091
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3092
    }
3093
    if ( aExp == 0 ) {
3094
        if ( aSig == 0 ) return a;
3095
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3096
    }
3097
    expDiff = aExp - bExp;
3098
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3099
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3100
    if ( expDiff < 0 ) {
3101
        if ( expDiff < -1 ) return a;
3102
        aSig >>= 1;
3103
    }
3104
    q = ( bSig <= aSig );
3105
    if ( q ) aSig -= bSig;
3106
    expDiff -= 64;
3107
    while ( 0 < expDiff ) {
3108
        q = estimateDiv128To64( aSig, 0, bSig );
3109
        q = ( 2 < q ) ? q - 2 : 0;
3110
        aSig = - ( ( bSig>>2 ) * q );
3111
        expDiff -= 62;
3112
    }
3113
    expDiff += 64;
3114
    if ( 0 < expDiff ) {
3115
        q = estimateDiv128To64( aSig, 0, bSig );
3116
        q = ( 2 < q ) ? q - 2 : 0;
3117
        q >>= 64 - expDiff;
3118
        bSig >>= 2;
3119
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3120
    }
3121
    else {
3122
        aSig >>= 2;
3123
        bSig >>= 2;
3124
    }
3125
    do {
3126
        alternateASig = aSig;
3127
        ++q;
3128
        aSig -= bSig;
3129
    } while ( 0 <= (sbits64) aSig );
3130
    sigMean = aSig + alternateASig;
3131
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3132
        aSig = alternateASig;
3133
    }
3134
    zSign = ( (sbits64) aSig < 0 );
3135
    if ( zSign ) aSig = - aSig;
3136
    return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3137

    
3138
}
3139

    
3140
/*----------------------------------------------------------------------------
3141
| Returns the square root of the double-precision floating-point value `a'.
3142
| The operation is performed according to the IEC/IEEE Standard for Binary
3143
| Floating-Point Arithmetic.
3144
*----------------------------------------------------------------------------*/
3145

    
3146
float64 float64_sqrt( float64 a STATUS_PARAM )
3147
{
3148
    flag aSign;
3149
    int16 aExp, zExp;
3150
    bits64 aSig, zSig, doubleZSig;
3151
    bits64 rem0, rem1, term0, term1;
3152

    
3153
    aSig = extractFloat64Frac( a );
3154
    aExp = extractFloat64Exp( a );
3155
    aSign = extractFloat64Sign( a );
3156
    if ( aExp == 0x7FF ) {
3157
        if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3158
        if ( ! aSign ) return a;
3159
        float_raise( float_flag_invalid STATUS_VAR);
3160
        return float64_default_nan;
3161
    }
3162
    if ( aSign ) {
3163
        if ( ( aExp | aSig ) == 0 ) return a;
3164
        float_raise( float_flag_invalid STATUS_VAR);
3165
        return float64_default_nan;
3166
    }
3167
    if ( aExp == 0 ) {
3168
        if ( aSig == 0 ) return float64_zero;
3169
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3170
    }
3171
    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3172
    aSig |= LIT64( 0x0010000000000000 );
3173
    zSig = estimateSqrt32( aExp, aSig>>21 );
3174
    aSig <<= 9 - ( aExp & 1 );
3175
    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3176
    if ( ( zSig & 0x1FF ) <= 5 ) {
3177
        doubleZSig = zSig<<1;
3178
        mul64To128( zSig, zSig, &term0, &term1 );
3179
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3180
        while ( (sbits64) rem0 < 0 ) {
3181
            --zSig;
3182
            doubleZSig -= 2;
3183
            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3184
        }
3185
        zSig |= ( ( rem0 | rem1 ) != 0 );
3186
    }
3187
    return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3188

    
3189
}
3190

    
3191
/*----------------------------------------------------------------------------
3192
| Returns the binary log of the double-precision floating-point value `a'.
3193
| The operation is performed according to the IEC/IEEE Standard for Binary
3194
| Floating-Point Arithmetic.
3195
*----------------------------------------------------------------------------*/
3196
float64 float64_log2( float64 a STATUS_PARAM )
3197
{
3198
    flag aSign, zSign;
3199
    int16 aExp;
3200
    bits64 aSig, aSig0, aSig1, zSig, i;
3201

    
3202
    aSig = extractFloat64Frac( a );
3203
    aExp = extractFloat64Exp( a );
3204
    aSign = extractFloat64Sign( a );
3205

    
3206
    if ( aExp == 0 ) {
3207
        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3208
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3209
    }
3210
    if ( aSign ) {
3211
        float_raise( float_flag_invalid STATUS_VAR);
3212
        return float64_default_nan;
3213
    }
3214
    if ( aExp == 0x7FF ) {
3215
        if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3216
        return a;
3217
    }
3218

    
3219
    aExp -= 0x3FF;
3220
    aSig |= LIT64( 0x0010000000000000 );
3221
    zSign = aExp < 0;
3222
    zSig = (bits64)aExp << 52;
3223
    for (i = 1LL << 51; i > 0; i >>= 1) {
3224
        mul64To128( aSig, aSig, &aSig0, &aSig1 );
3225
        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3226
        if ( aSig & LIT64( 0x0020000000000000 ) ) {
3227
            aSig >>= 1;
3228
            zSig |= i;
3229
        }
3230
    }
3231

    
3232
    if ( zSign )
3233
        zSig = -zSig;
3234
    return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3235
}
3236

    
3237
/*----------------------------------------------------------------------------
3238
| Returns 1 if the double-precision floating-point value `a' is equal to the
3239
| corresponding value `b', and 0 otherwise.  The comparison is performed
3240
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3241
*----------------------------------------------------------------------------*/
3242

    
3243
int float64_eq( float64 a, float64 b STATUS_PARAM )
3244
{
3245
    bits64 av, bv;
3246

    
3247
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3248
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3249
       ) {
3250
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3251
            float_raise( float_flag_invalid STATUS_VAR);
3252
        }
3253
        return 0;
3254
    }
3255
    av = float64_val(a);
3256
    bv = float64_val(b);
3257
    return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3258

    
3259
}
3260

    
3261
/*----------------------------------------------------------------------------
3262
| Returns 1 if the double-precision floating-point value `a' is less than or
3263
| equal to the corresponding value `b', and 0 otherwise.  The comparison is
3264
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3265
| Arithmetic.
3266
*----------------------------------------------------------------------------*/
3267

    
3268
int float64_le( float64 a, float64 b STATUS_PARAM )
3269
{
3270
    flag aSign, bSign;
3271
    bits64 av, bv;
3272

    
3273
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3274
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3275
       ) {
3276
        float_raise( float_flag_invalid STATUS_VAR);
3277
        return 0;
3278
    }
3279
    aSign = extractFloat64Sign( a );
3280
    bSign = extractFloat64Sign( b );
3281
    av = float64_val(a);
3282
    bv = float64_val(b);
3283
    if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3284
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3285

    
3286
}
3287

    
3288
/*----------------------------------------------------------------------------
3289
| Returns 1 if the double-precision floating-point value `a' is less than
3290
| the corresponding value `b', and 0 otherwise.  The comparison is performed
3291
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3292
*----------------------------------------------------------------------------*/
3293

    
3294
int float64_lt( float64 a, float64 b STATUS_PARAM )
3295
{
3296
    flag aSign, bSign;
3297
    bits64 av, bv;
3298

    
3299
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3300
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3301
       ) {
3302
        float_raise( float_flag_invalid STATUS_VAR);
3303
        return 0;
3304
    }
3305
    aSign = extractFloat64Sign( a );
3306
    bSign = extractFloat64Sign( b );
3307
    av = float64_val(a);
3308
    bv = float64_val(b);
3309
    if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3310
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3311

    
3312
}
3313

    
3314
/*----------------------------------------------------------------------------
3315
| Returns 1 if the double-precision floating-point value `a' is equal to the
3316
| corresponding value `b', and 0 otherwise.  The invalid exception is raised
3317
| if either operand is a NaN.  Otherwise, the comparison is performed
3318
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3319
*----------------------------------------------------------------------------*/
3320

    
3321
int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3322
{
3323
    bits64 av, bv;
3324

    
3325
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3326
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3327
       ) {
3328
        float_raise( float_flag_invalid STATUS_VAR);
3329
        return 0;
3330
    }
3331
    av = float64_val(a);
3332
    bv = float64_val(b);
3333
    return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3334

    
3335
}
3336

    
3337
/*----------------------------------------------------------------------------
3338
| Returns 1 if the double-precision floating-point value `a' is less than or
3339
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3340
| cause an exception.  Otherwise, the comparison is performed according to the
3341
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3342
*----------------------------------------------------------------------------*/
3343

    
3344
int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3345
{
3346
    flag aSign, bSign;
3347
    bits64 av, bv;
3348

    
3349
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3350
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3351
       ) {
3352
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3353
            float_raise( float_flag_invalid STATUS_VAR);
3354
        }
3355
        return 0;
3356
    }
3357
    aSign = extractFloat64Sign( a );
3358
    bSign = extractFloat64Sign( b );
3359
    av = float64_val(a);
3360
    bv = float64_val(b);
3361
    if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3362
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3363

    
3364
}
3365

    
3366
/*----------------------------------------------------------------------------
3367
| Returns 1 if the double-precision floating-point value `a' is less than
3368
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3369
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3370
| Standard for Binary Floating-Point Arithmetic.
3371
*----------------------------------------------------------------------------*/
3372

    
3373
int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3374
{
3375
    flag aSign, bSign;
3376
    bits64 av, bv;
3377

    
3378
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3379
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3380
       ) {
3381
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3382
            float_raise( float_flag_invalid STATUS_VAR);
3383
        }
3384
        return 0;
3385
    }
3386
    aSign = extractFloat64Sign( a );
3387
    bSign = extractFloat64Sign( b );
3388
    av = float64_val(a);
3389
    bv = float64_val(b);
3390
    if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3391
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3392

    
3393
}
3394

    
3395
#ifdef FLOATX80
3396

    
3397
/*----------------------------------------------------------------------------
3398
| Returns the result of converting the extended double-precision floating-
3399
| point value `a' to the 32-bit two's complement integer format.  The
3400
| conversion is performed according to the IEC/IEEE Standard for Binary
3401
| Floating-Point Arithmetic---which means in particular that the conversion
3402
| is rounded according to the current rounding mode.  If `a' is a NaN, the
3403
| largest positive integer is returned.  Otherwise, if the conversion
3404
| overflows, the largest integer with the same sign as `a' is returned.
3405
*----------------------------------------------------------------------------*/
3406

    
3407
int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3408
{
3409
    flag aSign;
3410
    int32 aExp, shiftCount;
3411
    bits64 aSig;
3412

    
3413
    aSig = extractFloatx80Frac( a );
3414
    aExp = extractFloatx80Exp( a );
3415
    aSign = extractFloatx80Sign( a );
3416
    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3417
    shiftCount = 0x4037 - aExp;
3418
    if ( shiftCount <= 0 ) shiftCount = 1;
3419
    shift64RightJamming( aSig, shiftCount, &aSig );
3420
    return roundAndPackInt32( aSign, aSig STATUS_VAR );
3421

    
3422
}
3423

    
3424
/*----------------------------------------------------------------------------
3425
| Returns the result of converting the extended double-precision floating-
3426
| point value `a' to the 32-bit two's complement integer format.  The
3427
| conversion is performed according to the IEC/IEEE Standard for Binary
3428
| Floating-Point Arithmetic, except that the conversion is always rounded
3429
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3430
| Otherwise, if the conversion overflows, the largest integer with the same
3431
| sign as `a' is returned.
3432
*----------------------------------------------------------------------------*/
3433

    
3434
int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3435
{
3436
    flag aSign;
3437
    int32 aExp, shiftCount;
3438
    bits64 aSig, savedASig;
3439
    int32 z;
3440

    
3441
    aSig = extractFloatx80Frac( a );
3442
    aExp = extractFloatx80Exp( a );
3443
    aSign = extractFloatx80Sign( a );
3444
    if ( 0x401E < aExp ) {
3445
        if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3446
        goto invalid;
3447
    }
3448
    else if ( aExp < 0x3FFF ) {
3449
        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3450
        return 0;
3451
    }
3452
    shiftCount = 0x403E - aExp;
3453
    savedASig = aSig;
3454
    aSig >>= shiftCount;
3455
    z = aSig;
3456
    if ( aSign ) z = - z;
3457
    if ( ( z < 0 ) ^ aSign ) {
3458
 invalid:
3459
        float_raise( float_flag_invalid STATUS_VAR);
3460
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3461
    }
3462
    if ( ( aSig<<shiftCount ) != savedASig ) {
3463
        STATUS(float_exception_flags) |= float_flag_inexact;
3464
    }
3465
    return z;
3466

    
3467
}
3468

    
3469
/*----------------------------------------------------------------------------
3470
| Returns the result of converting the extended double-precision floating-
3471
| point value `a' to the 64-bit two's complement integer format.  The
3472
| conversion is performed according to the IEC/IEEE Standard for Binary
3473
| Floating-Point Arithmetic---which means in particular that the conversion
3474
| is rounded according to the current rounding mode.  If `a' is a NaN,
3475
| the largest positive integer is returned.  Otherwise, if the conversion
3476
| overflows, the largest integer with the same sign as `a' is returned.
3477
*----------------------------------------------------------------------------*/
3478

    
3479
int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3480
{
3481
    flag aSign;
3482
    int32 aExp, shiftCount;
3483
    bits64 aSig, aSigExtra;
3484

    
3485
    aSig = extractFloatx80Frac( a );
3486
    aExp = extractFloatx80Exp( a );
3487
    aSign = extractFloatx80Sign( a );
3488
    shiftCount = 0x403E - aExp;
3489
    if ( shiftCount <= 0 ) {
3490
        if ( shiftCount ) {
3491
            float_raise( float_flag_invalid STATUS_VAR);
3492
            if (    ! aSign
3493
                 || (    ( aExp == 0x7FFF )
3494
                      && ( aSig != LIT64( 0x8000000000000000 ) ) )
3495
               ) {
3496
                return LIT64( 0x7FFFFFFFFFFFFFFF );
3497
            }
3498
            return (sbits64) LIT64( 0x8000000000000000 );
3499
        }
3500
        aSigExtra = 0;
3501
    }
3502
    else {
3503
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3504
    }
3505
    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3506

    
3507
}
3508

    
3509
/*----------------------------------------------------------------------------
3510
| Returns the result of converting the extended double-precision floating-
3511
| point value `a' to the 64-bit two's complement integer format.  The
3512
| conversion is performed according to the IEC/IEEE Standard for Binary
3513
| Floating-Point Arithmetic, except that the conversion is always rounded
3514
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3515
| Otherwise, if the conversion overflows, the largest integer with the same
3516
| sign as `a' is returned.
3517
*----------------------------------------------------------------------------*/
3518

    
3519
int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3520
{
3521
    flag aSign;
3522
    int32 aExp, shiftCount;
3523
    bits64 aSig;
3524
    int64 z;
3525

    
3526
    aSig = extractFloatx80Frac( a );
3527
    aExp = extractFloatx80Exp( a );
3528
    aSign = extractFloatx80Sign( a );
3529
    shiftCount = aExp - 0x403E;
3530
    if ( 0 <= shiftCount ) {
3531
        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3532
        if ( ( a.high != 0xC03E ) || aSig ) {
3533
            float_raise( float_flag_invalid STATUS_VAR);
3534
            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3535
                return LIT64( 0x7FFFFFFFFFFFFFFF );
3536
            }
3537
        }
3538
        return (sbits64) LIT64( 0x8000000000000000 );
3539
    }
3540
    else if ( aExp < 0x3FFF ) {
3541
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3542
        return 0;
3543
    }
3544
    z = aSig>>( - shiftCount );
3545
    if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3546
        STATUS(float_exception_flags) |= float_flag_inexact;
3547
    }
3548
    if ( aSign ) z = - z;
3549
    return z;
3550

    
3551
}
3552

    
3553
/*----------------------------------------------------------------------------
3554
| Returns the result of converting the extended double-precision floating-
3555
| point value `a' to the single-precision floating-point format.  The
3556
| conversion is performed according to the IEC/IEEE Standard for Binary
3557
| Floating-Point Arithmetic.
3558
*----------------------------------------------------------------------------*/
3559

    
3560
float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3561
{
3562
    flag aSign;
3563
    int32 aExp;
3564
    bits64 aSig;
3565

    
3566
    aSig = extractFloatx80Frac( a );
3567
    aExp = extractFloatx80Exp( a );
3568
    aSign = extractFloatx80Sign( a );
3569
    if ( aExp == 0x7FFF ) {
3570
        if ( (bits64) ( aSig<<1 ) ) {
3571
            return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3572
        }
3573
        return packFloat32( aSign, 0xFF, 0 );
3574
    }
3575
    shift64RightJamming( aSig, 33, &aSig );
3576
    if ( aExp || aSig ) aExp -= 0x3F81;
3577
    return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3578

    
3579
}
3580

    
3581
/*----------------------------------------------------------------------------
3582
| Returns the result of converting the extended double-precision floating-
3583
| point value `a' to the double-precision floating-point format.  The
3584
| conversion is performed according to the IEC/IEEE Standard for Binary
3585
| Floating-Point Arithmetic.
3586
*----------------------------------------------------------------------------*/
3587

    
3588
float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3589
{
3590
    flag aSign;
3591
    int32 aExp;
3592
    bits64 aSig, zSig;
3593

    
3594
    aSig = extractFloatx80Frac( a );
3595
    aExp = extractFloatx80Exp( a );
3596
    aSign = extractFloatx80Sign( a );
3597
    if ( aExp == 0x7FFF ) {
3598
        if ( (bits64) ( aSig<<1 ) ) {
3599
            return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3600
        }
3601
        return packFloat64( aSign, 0x7FF, 0 );
3602
    }
3603
    shift64RightJamming( aSig, 1, &zSig );
3604
    if ( aExp || aSig ) aExp -= 0x3C01;
3605
    return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3606

    
3607
}
3608

    
3609
#ifdef FLOAT128
3610

    
3611
/*----------------------------------------------------------------------------
3612
| Returns the result of converting the extended double-precision floating-
3613
| point value `a' to the quadruple-precision floating-point format.  The
3614
| conversion is performed according to the IEC/IEEE Standard for Binary
3615
| Floating-Point Arithmetic.
3616
*----------------------------------------------------------------------------*/
3617

    
3618
float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3619
{
3620
    flag aSign;
3621
    int16 aExp;
3622
    bits64 aSig, zSig0, zSig1;
3623

    
3624
    aSig = extractFloatx80Frac( a );
3625
    aExp = extractFloatx80Exp( a );
3626
    aSign = extractFloatx80Sign( a );
3627
    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3628
        return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3629
    }
3630
    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3631
    return packFloat128( aSign, aExp, zSig0, zSig1 );
3632

    
3633
}
3634

    
3635
#endif
3636

    
3637
/*----------------------------------------------------------------------------
3638
| Rounds the extended double-precision floating-point value `a' to an integer,
3639
| and returns the result as an extended quadruple-precision floating-point
3640
| value.  The operation is performed according to the IEC/IEEE Standard for
3641
| Binary Floating-Point Arithmetic.
3642
*----------------------------------------------------------------------------*/
3643

    
3644
floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3645
{
3646
    flag aSign;
3647
    int32 aExp;
3648
    bits64 lastBitMask, roundBitsMask;
3649
    int8 roundingMode;
3650
    floatx80 z;
3651

    
3652
    aExp = extractFloatx80Exp( a );
3653
    if ( 0x403E <= aExp ) {
3654
        if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3655
            return propagateFloatx80NaN( a, a STATUS_VAR );
3656
        }
3657
        return a;
3658
    }
3659
    if ( aExp < 0x3FFF ) {
3660
        if (    ( aExp == 0 )
3661
             && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3662
            return a;
3663
        }
3664
        STATUS(float_exception_flags) |= float_flag_inexact;
3665
        aSign = extractFloatx80Sign( a );
3666
        switch ( STATUS(float_rounding_mode) ) {
3667
         case float_round_nearest_even:
3668
            if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3669
               ) {
3670
                return
3671
                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3672
            }
3673
            break;
3674
         case float_round_down:
3675
            return
3676
                  aSign ?
3677
                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3678
                : packFloatx80( 0, 0, 0 );
3679
         case float_round_up:
3680
            return
3681
                  aSign ? packFloatx80( 1, 0, 0 )
3682
                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3683
        }
3684
        return packFloatx80( aSign, 0, 0 );
3685
    }
3686
    lastBitMask = 1;
3687
    lastBitMask <<= 0x403E - aExp;
3688
    roundBitsMask = lastBitMask - 1;
3689
    z = a;
3690
    roundingMode = STATUS(float_rounding_mode);
3691
    if ( roundingMode == float_round_nearest_even ) {
3692
        z.low += lastBitMask>>1;
3693
        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3694
    }
3695
    else if ( roundingMode != float_round_to_zero ) {
3696
        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3697
            z.low += roundBitsMask;
3698
        }
3699
    }
3700
    z.low &= ~ roundBitsMask;
3701
    if ( z.low == 0 ) {
3702
        ++z.high;
3703
        z.low = LIT64( 0x8000000000000000 );
3704
    }
3705
    if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
3706
    return z;
3707

    
3708
}
3709

    
3710
/*----------------------------------------------------------------------------
3711
| Returns the result of adding the absolute values of the extended double-
3712
| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3713
| negated before being returned.  `zSign' is ignored if the result is a NaN.
3714
| The addition is performed according to the IEC/IEEE Standard for Binary
3715
| Floating-Point Arithmetic.
3716
*----------------------------------------------------------------------------*/
3717

    
3718
static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3719
{
3720
    int32 aExp, bExp, zExp;
3721
    bits64 aSig, bSig, zSig0, zSig1;
3722
    int32 expDiff;
3723

    
3724
    aSig = extractFloatx80Frac( a );
3725
    aExp = extractFloatx80Exp( a );
3726
    bSig = extractFloatx80Frac( b );
3727
    bExp = extractFloatx80Exp( b );
3728
    expDiff = aExp - bExp;
3729
    if ( 0 < expDiff ) {
3730
        if ( aExp == 0x7FFF ) {
3731
            if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3732
            return a;
3733
        }
3734
        if ( bExp == 0 ) --expDiff;
3735
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3736
        zExp = aExp;
3737
    }
3738
    else if ( expDiff < 0 ) {
3739
        if ( bExp == 0x7FFF ) {
3740
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3741
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3742
        }
3743
        if ( aExp == 0 ) ++expDiff;
3744
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3745
        zExp = bExp;
3746
    }
3747
    else {
3748
        if ( aExp == 0x7FFF ) {
3749
            if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3750
                return propagateFloatx80NaN( a, b STATUS_VAR );
3751
            }
3752
            return a;
3753
        }
3754
        zSig1 = 0;
3755
        zSig0 = aSig + bSig;
3756
        if ( aExp == 0 ) {
3757
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3758
            goto roundAndPack;
3759
        }
3760
        zExp = aExp;
3761
        goto shiftRight1;
3762
    }
3763
    zSig0 = aSig + bSig;
3764
    if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3765
 shiftRight1:
3766
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3767
    zSig0 |= LIT64( 0x8000000000000000 );
3768
    ++zExp;
3769
 roundAndPack:
3770
    return
3771
        roundAndPackFloatx80(
3772
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3773

    
3774
}
3775

    
3776
/*----------------------------------------------------------------------------
3777
| Returns the result of subtracting the absolute values of the extended
3778
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3779
| difference is negated before being returned.  `zSign' is ignored if the
3780
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
3781
| Standard for Binary Floating-Point Arithmetic.
3782
*----------------------------------------------------------------------------*/
3783

    
3784
static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
3785
{
3786
    int32 aExp, bExp, zExp;
3787
    bits64 aSig, bSig, zSig0, zSig1;
3788
    int32 expDiff;
3789
    floatx80 z;
3790

    
3791
    aSig = extractFloatx80Frac( a );
3792
    aExp = extractFloatx80Exp( a );
3793
    bSig = extractFloatx80Frac( b );
3794
    bExp = extractFloatx80Exp( b );
3795
    expDiff = aExp - bExp;
3796
    if ( 0 < expDiff ) goto aExpBigger;
3797
    if ( expDiff < 0 ) goto bExpBigger;
3798
    if ( aExp == 0x7FFF ) {
3799
        if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3800
            return propagateFloatx80NaN( a, b STATUS_VAR );
3801
        }
3802
        float_raise( float_flag_invalid STATUS_VAR);
3803
        z.low = floatx80_default_nan_low;
3804
        z.high = floatx80_default_nan_high;
3805
        return z;
3806
    }
3807
    if ( aExp == 0 ) {
3808
        aExp = 1;
3809
        bExp = 1;
3810
    }
3811
    zSig1 = 0;
3812
    if ( bSig < aSig ) goto aBigger;
3813
    if ( aSig < bSig ) goto bBigger;
3814
    return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3815
 bExpBigger:
3816
    if ( bExp == 0x7FFF ) {
3817
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3818
        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3819
    }
3820
    if ( aExp == 0 ) ++expDiff;
3821
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3822
 bBigger:
3823
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3824
    zExp = bExp;
3825
    zSign ^= 1;
3826
    goto normalizeRoundAndPack;
3827
 aExpBigger:
3828
    if ( aExp == 0x7FFF ) {
3829
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3830
        return a;
3831
    }
3832
    if ( bExp == 0 ) --expDiff;
3833
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3834
 aBigger:
3835
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3836
    zExp = aExp;
3837
 normalizeRoundAndPack:
3838
    return
3839
        normalizeRoundAndPackFloatx80(
3840
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3841

    
3842
}
3843

    
3844
/*----------------------------------------------------------------------------
3845
| Returns the result of adding the extended double-precision floating-point
3846
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
3847
| Standard for Binary Floating-Point Arithmetic.
3848
*----------------------------------------------------------------------------*/
3849

    
3850
floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
3851
{
3852
    flag aSign, bSign;
3853

    
3854
    aSign = extractFloatx80Sign( a );
3855
    bSign = extractFloatx80Sign( b );
3856
    if ( aSign == bSign ) {
3857
        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3858
    }
3859
    else {
3860
        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3861
    }
3862

    
3863
}
3864

    
3865
/*----------------------------------------------------------------------------
3866
| Returns the result of subtracting the extended double-precision floating-
3867
| point values `a' and `b'.  The operation is performed according to the
3868
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3869
*----------------------------------------------------------------------------*/
3870

    
3871
floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
3872
{
3873
    flag aSign, bSign;
3874

    
3875
    aSign = extractFloatx80Sign( a );
3876
    bSign = extractFloatx80Sign( b );
3877
    if ( aSign == bSign ) {
3878
        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3879
    }
3880
    else {
3881
        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3882
    }
3883

    
3884
}
3885

    
3886
/*----------------------------------------------------------------------------
3887
| Returns the result of multiplying the extended double-precision floating-
3888
| point values `a' and `b'.  The operation is performed according to the
3889
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3890
*----------------------------------------------------------------------------*/
3891

    
3892
floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
3893
{
3894
    flag aSign, bSign, zSign;
3895
    int32 aExp, bExp, zExp;
3896
    bits64 aSig, bSig, zSig0, zSig1;
3897
    floatx80 z;
3898

    
3899
    aSig = extractFloatx80Frac( a );
3900
    aExp = extractFloatx80Exp( a );
3901
    aSign = extractFloatx80Sign( a );
3902
    bSig = extractFloatx80Frac( b );
3903
    bExp = extractFloatx80Exp( b );
3904
    bSign = extractFloatx80Sign( b );
3905
    zSign = aSign ^ bSign;
3906
    if ( aExp == 0x7FFF ) {
3907
        if (    (bits64) ( aSig<<1 )
3908
             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3909
            return propagateFloatx80NaN( a, b STATUS_VAR );
3910
        }
3911
        if ( ( bExp | bSig ) == 0 ) goto invalid;
3912
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3913
    }
3914
    if ( bExp == 0x7FFF ) {
3915
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3916
        if ( ( aExp | aSig ) == 0 ) {
3917
 invalid:
3918
            float_raise( float_flag_invalid STATUS_VAR);
3919
            z.low = floatx80_default_nan_low;
3920
            z.high = floatx80_default_nan_high;
3921
            return z;
3922
        }
3923
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3924
    }
3925
    if ( aExp == 0 ) {
3926
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3927
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3928
    }
3929
    if ( bExp == 0 ) {
3930
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3931
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3932
    }
3933
    zExp = aExp + bExp - 0x3FFE;
3934
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
3935
    if ( 0 < (sbits64) zSig0 ) {
3936
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3937
        --zExp;
3938
    }
3939
    return
3940
        roundAndPackFloatx80(
3941
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3942

    
3943
}
3944

    
3945
/*----------------------------------------------------------------------------
3946
| Returns the result of dividing the extended double-precision floating-point
3947
| value `a' by the corresponding value `b'.  The operation is performed
3948
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3949
*----------------------------------------------------------------------------*/
3950

    
3951
floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
3952
{
3953
    flag aSign, bSign, zSign;
3954
    int32 aExp, bExp, zExp;
3955
    bits64 aSig, bSig, zSig0, zSig1;
3956
    bits64 rem0, rem1, rem2, term0, term1, term2;
3957
    floatx80 z;
3958

    
3959
    aSig = extractFloatx80Frac( a );
3960
    aExp = extractFloatx80Exp( a );
3961
    aSign = extractFloatx80Sign( a );
3962
    bSig = extractFloatx80Frac( b );
3963
    bExp = extractFloatx80Exp( b );
3964
    bSign = extractFloatx80Sign( b );
3965
    zSign = aSign ^ bSign;
3966
    if ( aExp == 0x7FFF ) {
3967
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3968
        if ( bExp == 0x7FFF ) {
3969
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3970
            goto invalid;
3971
        }
3972
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3973
    }
3974
    if ( bExp == 0x7FFF ) {
3975
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3976
        return packFloatx80( zSign, 0, 0 );
3977
    }
3978
    if ( bExp == 0 ) {
3979
        if ( bSig == 0 ) {
3980
            if ( ( aExp | aSig ) == 0 ) {
3981
 invalid:
3982
                float_raise( float_flag_invalid STATUS_VAR);
3983
                z.low = floatx80_default_nan_low;
3984
                z.high = floatx80_default_nan_high;
3985
                return z;
3986
            }
3987
            float_raise( float_flag_divbyzero STATUS_VAR);
3988
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3989
        }
3990
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3991
    }
3992
    if ( aExp == 0 ) {
3993
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3994
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3995
    }
3996
    zExp = aExp - bExp + 0x3FFE;
3997
    rem1 = 0;
3998
    if ( bSig <= aSig ) {
3999
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
4000
        ++zExp;
4001
    }
4002
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4003
    mul64To128( bSig, zSig0, &term0, &term1 );
4004
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4005
    while ( (sbits64) rem0 < 0 ) {
4006
        --zSig0;
4007
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4008
    }
4009
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
4010
    if ( (bits64) ( zSig1<<1 ) <= 8 ) {
4011
        mul64To128( bSig, zSig1, &term1, &term2 );
4012
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4013
        while ( (sbits64) rem1 < 0 ) {
4014
            --zSig1;
4015
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4016
        }
4017
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
4018
    }
4019
    return
4020
        roundAndPackFloatx80(
4021
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4022

    
4023
}
4024

    
4025
/*----------------------------------------------------------------------------
4026
| Returns the remainder of the extended double-precision floating-point value
4027
| `a' with respect to the corresponding value `b'.  The operation is performed
4028
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4029
*----------------------------------------------------------------------------*/
4030

    
4031
floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4032
{
4033
    flag aSign, zSign;
4034
    int32 aExp, bExp, expDiff;
4035
    bits64 aSig0, aSig1, bSig;
4036
    bits64 q, term0, term1, alternateASig0, alternateASig1;
4037
    floatx80 z;
4038

    
4039
    aSig0 = extractFloatx80Frac( a );
4040
    aExp = extractFloatx80Exp( a );
4041
    aSign = extractFloatx80Sign( a );
4042
    bSig = extractFloatx80Frac( b );
4043
    bExp = extractFloatx80Exp( b );
4044
    if ( aExp == 0x7FFF ) {
4045
        if (    (bits64) ( aSig0<<1 )
4046
             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4047
            return propagateFloatx80NaN( a, b STATUS_VAR );
4048
        }
4049
        goto invalid;
4050
    }
4051
    if ( bExp == 0x7FFF ) {
4052
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4053
        return a;
4054
    }
4055
    if ( bExp == 0 ) {
4056
        if ( bSig == 0 ) {
4057
 invalid:
4058
            float_raise( float_flag_invalid STATUS_VAR);
4059
            z.low = floatx80_default_nan_low;
4060
            z.high = floatx80_default_nan_high;
4061
            return z;
4062
        }
4063
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4064
    }
4065
    if ( aExp == 0 ) {
4066
        if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
4067
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4068
    }
4069
    bSig |= LIT64( 0x8000000000000000 );
4070
    zSign = aSign;
4071
    expDiff = aExp - bExp;
4072
    aSig1 = 0;
4073
    if ( expDiff < 0 ) {
4074
        if ( expDiff < -1 ) return a;
4075
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4076
        expDiff = 0;
4077
    }
4078
    q = ( bSig <= aSig0 );
4079
    if ( q ) aSig0 -= bSig;
4080
    expDiff -= 64;
4081
    while ( 0 < expDiff ) {
4082
        q = estimateDiv128To64( aSig0, aSig1, bSig );
4083
        q = ( 2 < q ) ? q - 2 : 0;
4084
        mul64To128( bSig, q, &term0, &term1 );
4085
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4086
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4087
        expDiff -= 62;
4088
    }
4089
    expDiff += 64;
4090
    if ( 0 < expDiff ) {
4091
        q = estimateDiv128To64( aSig0, aSig1, bSig );
4092
        q = ( 2 < q ) ? q - 2 : 0;
4093
        q >>= 64 - expDiff;
4094
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4095
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4096
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4097
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
4098
            ++q;
4099
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4100
        }
4101
    }
4102
    else {
4103
        term1 = 0;
4104
        term0 = bSig;
4105
    }
4106
    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4107
    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4108
         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4109
              && ( q & 1 ) )
4110
       ) {
4111
        aSig0 = alternateASig0;
4112
        aSig1 = alternateASig1;
4113
        zSign = ! zSign;
4114
    }
4115
    return
4116
        normalizeRoundAndPackFloatx80(
4117
            80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4118

    
4119
}
4120

    
4121
/*----------------------------------------------------------------------------
4122
| Returns the square root of the extended double-precision floating-point
4123
| value `a'.  The operation is performed according to the IEC/IEEE Standard
4124
| for Binary Floating-Point Arithmetic.
4125
*----------------------------------------------------------------------------*/
4126

    
4127
floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4128
{
4129
    flag aSign;
4130
    int32 aExp, zExp;
4131
    bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4132
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4133
    floatx80 z;
4134

    
4135
    aSig0 = extractFloatx80Frac( a );
4136
    aExp = extractFloatx80Exp( a );
4137
    aSign = extractFloatx80Sign( a );
4138
    if ( aExp == 0x7FFF ) {
4139
        if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4140
        if ( ! aSign ) return a;
4141
        goto invalid;
4142
    }
4143
    if ( aSign ) {
4144
        if ( ( aExp | aSig0 ) == 0 ) return a;
4145
 invalid:
4146
        float_raise( float_flag_invalid STATUS_VAR);
4147
        z.low = floatx80_default_nan_low;
4148
        z.high = floatx80_default_nan_high;
4149
        return z;
4150
    }
4151
    if ( aExp == 0 ) {
4152
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4153
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4154
    }
4155
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4156
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4157
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4158
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4159
    doubleZSig0 = zSig0<<1;
4160
    mul64To128( zSig0, zSig0, &term0, &term1 );
4161
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4162
    while ( (sbits64) rem0 < 0 ) {
4163
        --zSig0;
4164
        doubleZSig0 -= 2;
4165
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4166
    }
4167
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4168
    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4169
        if ( zSig1 == 0 ) zSig1 = 1;
4170
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4171
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4172
        mul64To128( zSig1, zSig1, &term2, &term3 );
4173
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4174
        while ( (sbits64) rem1 < 0 ) {
4175
            --zSig1;
4176
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4177
            term3 |= 1;
4178
            term2 |= doubleZSig0;
4179
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4180
        }
4181
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4182
    }
4183
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4184
    zSig0 |= doubleZSig0;
4185
    return
4186
        roundAndPackFloatx80(
4187
            STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4188

    
4189
}
4190

    
4191
/*----------------------------------------------------------------------------
4192
| Returns 1 if the extended double-precision floating-point value `a' is
4193
| equal to the corresponding value `b', and 0 otherwise.  The comparison is
4194
| performed according to the IEC/IEEE Standard for Binary Floating-Point
4195
| Arithmetic.
4196
*----------------------------------------------------------------------------*/
4197

    
4198
int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4199
{
4200

    
4201
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4202
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4203
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4204
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4205
       ) {
4206
        if (    floatx80_is_signaling_nan( a )
4207
             || floatx80_is_signaling_nan( b ) ) {
4208
            float_raise( float_flag_invalid STATUS_VAR);
4209
        }
4210
        return 0;
4211
    }
4212
    return
4213
           ( a.low == b.low )
4214
        && (    ( a.high == b.high )
4215
             || (    ( a.low == 0 )
4216
                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4217
           );
4218

    
4219
}
4220

    
4221
/*----------------------------------------------------------------------------
4222
| Returns 1 if the extended double-precision floating-point value `a' is
4223
| less than or equal to the corresponding value `b', and 0 otherwise.  The
4224
| comparison is performed according to the IEC/IEEE Standard for Binary
4225
| Floating-Point Arithmetic.
4226
*----------------------------------------------------------------------------*/
4227

    
4228
int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4229
{
4230
    flag aSign, bSign;
4231

    
4232
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4233
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4234
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4235
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4236
       ) {
4237
        float_raise( float_flag_invalid STATUS_VAR);
4238
        return 0;
4239
    }
4240
    aSign = extractFloatx80Sign( a );
4241
    bSign = extractFloatx80Sign( b );
4242
    if ( aSign != bSign ) {
4243
        return
4244
               aSign
4245
            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4246
                 == 0 );
4247
    }
4248
    return
4249
          aSign ? le128( b.high, b.low, a.high, a.low )
4250
        : le128( a.high, a.low, b.high, b.low );
4251

    
4252
}
4253

    
4254
/*----------------------------------------------------------------------------
4255
| Returns 1 if the extended double-precision floating-point value `a' is
4256
| less than the corresponding value `b', and 0 otherwise.  The comparison
4257
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4258
| Arithmetic.
4259
*----------------------------------------------------------------------------*/
4260

    
4261
int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4262
{
4263
    flag aSign, bSign;
4264

    
4265
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4266
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4267
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4268
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4269
       ) {
4270
        float_raise( float_flag_invalid STATUS_VAR);
4271
        return 0;
4272
    }
4273
    aSign = extractFloatx80Sign( a );
4274
    bSign = extractFloatx80Sign( b );
4275
    if ( aSign != bSign ) {
4276
        return
4277
               aSign
4278
            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4279
                 != 0 );
4280
    }
4281
    return
4282
          aSign ? lt128( b.high, b.low, a.high, a.low )
4283
        : lt128( a.high, a.low, b.high, b.low );
4284

    
4285
}
4286

    
4287
/*----------------------------------------------------------------------------
4288
| Returns 1 if the extended double-precision floating-point value `a' is equal
4289
| to the corresponding value `b', and 0 otherwise.  The invalid exception is
4290
| raised if either operand is a NaN.  Otherwise, the comparison is performed
4291
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4292
*----------------------------------------------------------------------------*/
4293

    
4294
int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4295
{
4296

    
4297
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4298
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4299
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4300
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4301
       ) {
4302
        float_raise( float_flag_invalid STATUS_VAR);
4303
        return 0;
4304
    }
4305
    return
4306
           ( a.low == b.low )
4307
        && (    ( a.high == b.high )
4308
             || (    ( a.low == 0 )
4309
                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4310
           );
4311

    
4312
}
4313

    
4314
/*----------------------------------------------------------------------------
4315
| Returns 1 if the extended double-precision floating-point value `a' is less
4316
| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4317
| do not cause an exception.  Otherwise, the comparison is performed according
4318
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4319
*----------------------------------------------------------------------------*/
4320

    
4321
int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4322
{
4323
    flag aSign, bSign;
4324

    
4325
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4326
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4327
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4328
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4329
       ) {
4330
        if (    floatx80_is_signaling_nan( a )
4331
             || floatx80_is_signaling_nan( b ) ) {
4332
            float_raise( float_flag_invalid STATUS_VAR);
4333
        }
4334
        return 0;
4335
    }
4336
    aSign = extractFloatx80Sign( a );
4337
    bSign = extractFloatx80Sign( b );
4338
    if ( aSign != bSign ) {
4339
        return
4340
               aSign
4341
            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4342
                 == 0 );
4343
    }
4344
    return
4345
          aSign ? le128( b.high, b.low, a.high, a.low )
4346
        : le128( a.high, a.low, b.high, b.low );
4347

    
4348
}
4349

    
4350
/*----------------------------------------------------------------------------
4351
| Returns 1 if the extended double-precision floating-point value `a' is less
4352
| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4353
| an exception.  Otherwise, the comparison is performed according to the
4354
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4355
*----------------------------------------------------------------------------*/
4356

    
4357
int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4358
{
4359
    flag aSign, bSign;
4360

    
4361
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4362
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4363
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4364
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4365
       ) {
4366
        if (    floatx80_is_signaling_nan( a )
4367
             || floatx80_is_signaling_nan( b ) ) {
4368
            float_raise( float_flag_invalid STATUS_VAR);
4369
        }
4370
        return 0;
4371
    }
4372
    aSign = extractFloatx80Sign( a );
4373
    bSign = extractFloatx80Sign( b );
4374
    if ( aSign != bSign ) {
4375
        return
4376
               aSign
4377
            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4378
                 != 0 );
4379
    }
4380
    return
4381
          aSign ? lt128( b.high, b.low, a.high, a.low )
4382
        : lt128( a.high, a.low, b.high, b.low );
4383

    
4384
}
4385

    
4386
#endif
4387

    
4388
#ifdef FLOAT128
4389

    
4390
/*----------------------------------------------------------------------------
4391
| Returns the result of converting the quadruple-precision floating-point
4392
| value `a' to the 32-bit two's complement integer format.  The conversion
4393
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4394
| Arithmetic---which means in particular that the conversion is rounded
4395
| according to the current rounding mode.  If `a' is a NaN, the largest
4396
| positive integer is returned.  Otherwise, if the conversion overflows, the
4397
| largest integer with the same sign as `a' is returned.
4398
*----------------------------------------------------------------------------*/
4399

    
4400
int32 float128_to_int32( float128 a STATUS_PARAM )
4401
{
4402
    flag aSign;
4403
    int32 aExp, shiftCount;
4404
    bits64 aSig0, aSig1;
4405

    
4406
    aSig1 = extractFloat128Frac1( a );
4407
    aSig0 = extractFloat128Frac0( a );
4408
    aExp = extractFloat128Exp( a );
4409
    aSign = extractFloat128Sign( a );
4410
    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4411
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4412
    aSig0 |= ( aSig1 != 0 );
4413
    shiftCount = 0x4028 - aExp;
4414
    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4415
    return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4416

    
4417
}
4418

    
4419
/*----------------------------------------------------------------------------
4420
| Returns the result of converting the quadruple-precision floating-point
4421
| value `a' to the 32-bit two's complement integer format.  The conversion
4422
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4423
| Arithmetic, except that the conversion is always rounded toward zero.  If
4424
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4425
| conversion overflows, the largest integer with the same sign as `a' is
4426
| returned.
4427
*----------------------------------------------------------------------------*/
4428

    
4429
int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4430
{
4431
    flag aSign;
4432
    int32 aExp, shiftCount;
4433
    bits64 aSig0, aSig1, savedASig;
4434
    int32 z;
4435

    
4436
    aSig1 = extractFloat128Frac1( a );
4437
    aSig0 = extractFloat128Frac0( a );
4438
    aExp = extractFloat128Exp( a );
4439
    aSign = extractFloat128Sign( a );
4440
    aSig0 |= ( aSig1 != 0 );
4441
    if ( 0x401E < aExp ) {
4442
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4443
        goto invalid;
4444
    }
4445
    else if ( aExp < 0x3FFF ) {
4446
        if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4447
        return 0;
4448
    }
4449
    aSig0 |= LIT64( 0x0001000000000000 );
4450
    shiftCount = 0x402F - aExp;
4451
    savedASig = aSig0;
4452
    aSig0 >>= shiftCount;
4453
    z = aSig0;
4454
    if ( aSign ) z = - z;
4455
    if ( ( z < 0 ) ^ aSign ) {
4456
 invalid:
4457
        float_raise( float_flag_invalid STATUS_VAR);
4458
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4459
    }
4460
    if ( ( aSig0<<shiftCount ) != savedASig ) {
4461
        STATUS(float_exception_flags) |= float_flag_inexact;
4462
    }
4463
    return z;
4464

    
4465
}
4466

    
4467
/*----------------------------------------------------------------------------
4468
| Returns the result of converting the quadruple-precision floating-point
4469
| value `a' to the 64-bit two's complement integer format.  The conversion
4470
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4471
| Arithmetic---which means in particular that the conversion is rounded
4472
| according to the current rounding mode.  If `a' is a NaN, the largest
4473
| positive integer is returned.  Otherwise, if the conversion overflows, the
4474
| largest integer with the same sign as `a' is returned.
4475
*----------------------------------------------------------------------------*/
4476

    
4477
int64 float128_to_int64( float128 a STATUS_PARAM )
4478
{
4479
    flag aSign;
4480
    int32 aExp, shiftCount;
4481
    bits64 aSig0, aSig1;
4482

    
4483
    aSig1 = extractFloat128Frac1( a );
4484
    aSig0 = extractFloat128Frac0( a );
4485
    aExp = extractFloat128Exp( a );
4486
    aSign = extractFloat128Sign( a );
4487
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4488
    shiftCount = 0x402F - aExp;
4489
    if ( shiftCount <= 0 ) {
4490
        if ( 0x403E < aExp ) {
4491
            float_raise( float_flag_invalid STATUS_VAR);
4492
            if (    ! aSign
4493
                 || (    ( aExp == 0x7FFF )
4494
                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4495
                    )
4496
               ) {
4497
                return LIT64( 0x7FFFFFFFFFFFFFFF );
4498
            }
4499
            return (sbits64) LIT64( 0x8000000000000000 );
4500
        }
4501
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4502
    }
4503
    else {
4504
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4505
    }
4506
    return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4507

    
4508
}
4509

    
4510
/*----------------------------------------------------------------------------
4511
| Returns the result of converting the quadruple-precision floating-point
4512
| value `a' to the 64-bit two's complement integer format.  The conversion
4513
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4514
| Arithmetic, except that the conversion is always rounded toward zero.
4515
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4516
| the conversion overflows, the largest integer with the same sign as `a' is
4517
| returned.
4518
*----------------------------------------------------------------------------*/
4519

    
4520
int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4521
{
4522
    flag aSign;
4523
    int32 aExp, shiftCount;
4524
    bits64 aSig0, aSig1;
4525
    int64 z;
4526

    
4527
    aSig1 = extractFloat128Frac1( a );
4528
    aSig0 = extractFloat128Frac0( a );
4529
    aExp = extractFloat128Exp( a );
4530
    aSign = extractFloat128Sign( a );
4531
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4532
    shiftCount = aExp - 0x402F;
4533
    if ( 0 < shiftCount ) {
4534
        if ( 0x403E <= aExp ) {
4535
            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4536
            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4537
                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4538
                if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4539
            }
4540
            else {
4541
                float_raise( float_flag_invalid STATUS_VAR);
4542
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4543
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
4544
                }
4545
            }
4546
            return (sbits64) LIT64( 0x8000000000000000 );
4547
        }
4548
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4549
        if ( (bits64) ( aSig1<<shiftCount ) ) {
4550
            STATUS(float_exception_flags) |= float_flag_inexact;
4551
        }
4552
    }
4553
    else {
4554
        if ( aExp < 0x3FFF ) {
4555
            if ( aExp | aSig0 | aSig1 ) {
4556
                STATUS(float_exception_flags) |= float_flag_inexact;
4557
            }
4558
            return 0;
4559
        }
4560
        z = aSig0>>( - shiftCount );
4561
        if (    aSig1
4562
             || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4563
            STATUS(float_exception_flags) |= float_flag_inexact;
4564
        }
4565
    }
4566
    if ( aSign ) z = - z;
4567
    return z;
4568

    
4569
}
4570

    
4571
/*----------------------------------------------------------------------------
4572
| Returns the result of converting the quadruple-precision floating-point
4573
| value `a' to the single-precision floating-point format.  The conversion
4574
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4575
| Arithmetic.
4576
*----------------------------------------------------------------------------*/
4577

    
4578
float32 float128_to_float32( float128 a STATUS_PARAM )
4579
{
4580
    flag aSign;
4581
    int32 aExp;
4582
    bits64 aSig0, aSig1;
4583
    bits32 zSig;
4584

    
4585
    aSig1 = extractFloat128Frac1( a );
4586
    aSig0 = extractFloat128Frac0( a );
4587
    aExp = extractFloat128Exp( a );
4588
    aSign = extractFloat128Sign( a );
4589
    if ( aExp == 0x7FFF ) {
4590
        if ( aSig0 | aSig1 ) {
4591
            return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4592
        }
4593
        return packFloat32( aSign, 0xFF, 0 );
4594
    }
4595
    aSig0 |= ( aSig1 != 0 );
4596
    shift64RightJamming( aSig0, 18, &aSig0 );
4597
    zSig = aSig0;
4598
    if ( aExp || zSig ) {
4599
        zSig |= 0x40000000;
4600
        aExp -= 0x3F81;
4601
    }
4602
    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4603

    
4604
}
4605

    
4606
/*----------------------------------------------------------------------------
4607
| Returns the result of converting the quadruple-precision floating-point
4608
| value `a' to the double-precision floating-point format.  The conversion
4609
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4610
| Arithmetic.
4611
*----------------------------------------------------------------------------*/
4612

    
4613
float64 float128_to_float64( float128 a STATUS_PARAM )
4614
{
4615
    flag aSign;
4616
    int32 aExp;
4617
    bits64 aSig0, aSig1;
4618

    
4619
    aSig1 = extractFloat128Frac1( a );
4620
    aSig0 = extractFloat128Frac0( a );
4621
    aExp = extractFloat128Exp( a );
4622
    aSign = extractFloat128Sign( a );
4623
    if ( aExp == 0x7FFF ) {
4624
        if ( aSig0 | aSig1 ) {
4625
            return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4626
        }
4627
        return packFloat64( aSign, 0x7FF, 0 );
4628
    }
4629
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4630
    aSig0 |= ( aSig1 != 0 );
4631
    if ( aExp || aSig0 ) {
4632
        aSig0 |= LIT64( 0x4000000000000000 );
4633
        aExp -= 0x3C01;
4634
    }
4635
    return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4636

    
4637
}
4638

    
4639
#ifdef FLOATX80
4640

    
4641
/*----------------------------------------------------------------------------
4642
| Returns the result of converting the quadruple-precision floating-point
4643
| value `a' to the extended double-precision floating-point format.  The
4644
| conversion is performed according to the IEC/IEEE Standard for Binary
4645
| Floating-Point Arithmetic.
4646
*----------------------------------------------------------------------------*/
4647

    
4648
floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4649
{
4650
    flag aSign;
4651
    int32 aExp;
4652
    bits64 aSig0, aSig1;
4653

    
4654
    aSig1 = extractFloat128Frac1( a );
4655
    aSig0 = extractFloat128Frac0( a );
4656
    aExp = extractFloat128Exp( a );
4657
    aSign = extractFloat128Sign( a );
4658
    if ( aExp == 0x7FFF ) {
4659
        if ( aSig0 | aSig1 ) {
4660
            return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4661
        }
4662
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4663
    }
4664
    if ( aExp == 0 ) {
4665
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4666
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4667
    }
4668
    else {
4669
        aSig0 |= LIT64( 0x0001000000000000 );
4670
    }
4671
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4672
    return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4673

    
4674
}
4675

    
4676
#endif
4677

    
4678
/*----------------------------------------------------------------------------
4679
| Rounds the quadruple-precision floating-point value `a' to an integer, and
4680
| returns the result as a quadruple-precision floating-point value.  The
4681
| operation is performed according to the IEC/IEEE Standard for Binary
4682
| Floating-Point Arithmetic.
4683
*----------------------------------------------------------------------------*/
4684

    
4685
float128 float128_round_to_int( float128 a STATUS_PARAM )
4686
{
4687
    flag aSign;
4688
    int32 aExp;
4689
    bits64 lastBitMask, roundBitsMask;
4690
    int8 roundingMode;
4691
    float128 z;
4692

    
4693
    aExp = extractFloat128Exp( a );
4694
    if ( 0x402F <= aExp ) {
4695
        if ( 0x406F <= aExp ) {
4696
            if (    ( aExp == 0x7FFF )
4697
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4698
               ) {
4699
                return propagateFloat128NaN( a, a STATUS_VAR );
4700
            }
4701
            return a;
4702
        }
4703
        lastBitMask = 1;
4704
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4705
        roundBitsMask = lastBitMask - 1;
4706
        z = a;
4707
        roundingMode = STATUS(float_rounding_mode);
4708
        if ( roundingMode == float_round_nearest_even ) {
4709
            if ( lastBitMask ) {
4710
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4711
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4712
            }
4713
            else {
4714
                if ( (sbits64) z.low < 0 ) {
4715
                    ++z.high;
4716
                    if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4717
                }
4718
            }
4719
        }
4720
        else if ( roundingMode != float_round_to_zero ) {
4721
            if (   extractFloat128Sign( z )
4722
                 ^ ( roundingMode == float_round_up ) ) {
4723
                add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4724
            }
4725
        }
4726
        z.low &= ~ roundBitsMask;
4727
    }
4728
    else {
4729
        if ( aExp < 0x3FFF ) {
4730
            if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4731
            STATUS(float_exception_flags) |= float_flag_inexact;
4732
            aSign = extractFloat128Sign( a );
4733
            switch ( STATUS(float_rounding_mode) ) {
4734
             case float_round_nearest_even:
4735
                if (    ( aExp == 0x3FFE )
4736
                     && (   extractFloat128Frac0( a )
4737
                          | extractFloat128Frac1( a ) )
4738
                   ) {
4739
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
4740
                }
4741
                break;
4742
             case float_round_down:
4743
                return
4744
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4745
                    : packFloat128( 0, 0, 0, 0 );
4746
             case float_round_up:
4747
                return
4748
                      aSign ? packFloat128( 1, 0, 0, 0 )
4749
                    : packFloat128( 0, 0x3FFF, 0, 0 );
4750
            }
4751
            return packFloat128( aSign, 0, 0, 0 );
4752
        }
4753
        lastBitMask = 1;
4754
        lastBitMask <<= 0x402F - aExp;
4755
        roundBitsMask = lastBitMask - 1;
4756
        z.low = 0;
4757
        z.high = a.high;
4758
        roundingMode = STATUS(float_rounding_mode);
4759
        if ( roundingMode == float_round_nearest_even ) {
4760
            z.high += lastBitMask>>1;
4761
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4762
                z.high &= ~ lastBitMask;
4763
            }
4764
        }
4765
        else if ( roundingMode != float_round_to_zero ) {
4766
            if (   extractFloat128Sign( z )
4767
                 ^ ( roundingMode == float_round_up ) ) {
4768
                z.high |= ( a.low != 0 );
4769
                z.high += roundBitsMask;
4770
            }
4771
        }
4772
        z.high &= ~ roundBitsMask;
4773
    }
4774
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4775
        STATUS(float_exception_flags) |= float_flag_inexact;
4776
    }
4777
    return z;
4778

    
4779
}
4780

    
4781
/*----------------------------------------------------------------------------
4782
| Returns the result of adding the absolute values of the quadruple-precision
4783
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4784
| before being returned.  `zSign' is ignored if the result is a NaN.
4785
| The addition is performed according to the IEC/IEEE Standard for Binary
4786
| Floating-Point Arithmetic.
4787
*----------------------------------------------------------------------------*/
4788

    
4789
static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4790
{
4791
    int32 aExp, bExp, zExp;
4792
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4793
    int32 expDiff;
4794

    
4795
    aSig1 = extractFloat128Frac1( a );
4796
    aSig0 = extractFloat128Frac0( a );
4797
    aExp = extractFloat128Exp( a );
4798
    bSig1 = extractFloat128Frac1( b );
4799
    bSig0 = extractFloat128Frac0( b );
4800
    bExp = extractFloat128Exp( b );
4801
    expDiff = aExp - bExp;
4802
    if ( 0 < expDiff ) {
4803
        if ( aExp == 0x7FFF ) {
4804
            if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4805
            return a;
4806
        }
4807
        if ( bExp == 0 ) {
4808
            --expDiff;
4809
        }
4810
        else {
4811
            bSig0 |= LIT64( 0x0001000000000000 );
4812
        }
4813
        shift128ExtraRightJamming(
4814
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4815
        zExp = aExp;
4816
    }
4817
    else if ( expDiff < 0 ) {
4818
        if ( bExp == 0x7FFF ) {
4819
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4820
            return packFloat128( zSign, 0x7FFF, 0, 0 );
4821
        }
4822
        if ( aExp == 0 ) {
4823
            ++expDiff;
4824
        }
4825
        else {
4826
            aSig0 |= LIT64( 0x0001000000000000 );
4827
        }
4828
        shift128ExtraRightJamming(
4829
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4830
        zExp = bExp;
4831
    }
4832
    else {
4833
        if ( aExp == 0x7FFF ) {
4834
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4835
                return propagateFloat128NaN( a, b STATUS_VAR );
4836
            }
4837
            return a;
4838
        }
4839
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4840
        if ( aExp == 0 ) {
4841
            if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
4842
            return packFloat128( zSign, 0, zSig0, zSig1 );
4843
        }
4844
        zSig2 = 0;
4845
        zSig0 |= LIT64( 0x0002000000000000 );
4846
        zExp = aExp;
4847
        goto shiftRight1;
4848
    }
4849
    aSig0 |= LIT64( 0x0001000000000000 );
4850
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4851
    --zExp;
4852
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4853
    ++zExp;
4854
 shiftRight1:
4855
    shift128ExtraRightJamming(
4856
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4857
 roundAndPack:
4858
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4859

    
4860
}
4861

    
4862
/*----------------------------------------------------------------------------
4863
| Returns the result of subtracting the absolute values of the quadruple-
4864
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
4865
| difference is negated before being returned.  `zSign' is ignored if the
4866
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
4867
| Standard for Binary Floating-Point Arithmetic.
4868
*----------------------------------------------------------------------------*/
4869

    
4870
static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4871
{
4872
    int32 aExp, bExp, zExp;
4873
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4874
    int32 expDiff;
4875
    float128 z;
4876

    
4877
    aSig1 = extractFloat128Frac1( a );
4878
    aSig0 = extractFloat128Frac0( a );
4879
    aExp = extractFloat128Exp( a );
4880
    bSig1 = extractFloat128Frac1( b );
4881
    bSig0 = extractFloat128Frac0( b );
4882
    bExp = extractFloat128Exp( b );
4883
    expDiff = aExp - bExp;
4884
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4885
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4886
    if ( 0 < expDiff ) goto aExpBigger;
4887
    if ( expDiff < 0 ) goto bExpBigger;
4888
    if ( aExp == 0x7FFF ) {
4889
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4890
            return propagateFloat128NaN( a, b STATUS_VAR );
4891
        }
4892
        float_raise( float_flag_invalid STATUS_VAR);
4893
        z.low = float128_default_nan_low;
4894
        z.high = float128_default_nan_high;
4895
        return z;
4896
    }
4897
    if ( aExp == 0 ) {
4898
        aExp = 1;
4899
        bExp = 1;
4900
    }
4901
    if ( bSig0 < aSig0 ) goto aBigger;
4902
    if ( aSig0 < bSig0 ) goto bBigger;
4903
    if ( bSig1 < aSig1 ) goto aBigger;
4904
    if ( aSig1 < bSig1 ) goto bBigger;
4905
    return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
4906
 bExpBigger:
4907
    if ( bExp == 0x7FFF ) {
4908
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4909
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4910
    }
4911
    if ( aExp == 0 ) {
4912
        ++expDiff;
4913
    }
4914
    else {
4915
        aSig0 |= LIT64( 0x4000000000000000 );
4916
    }
4917
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4918
    bSig0 |= LIT64( 0x4000000000000000 );
4919
 bBigger:
4920
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4921
    zExp = bExp;
4922
    zSign ^= 1;
4923
    goto normalizeRoundAndPack;
4924
 aExpBigger:
4925
    if ( aExp == 0x7FFF ) {
4926
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4927
        return a;
4928
    }
4929
    if ( bExp == 0 ) {
4930
        --expDiff;
4931
    }
4932
    else {
4933
        bSig0 |= LIT64( 0x4000000000000000 );
4934
    }
4935
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4936
    aSig0 |= LIT64( 0x4000000000000000 );
4937
 aBigger:
4938
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4939
    zExp = aExp;
4940
 normalizeRoundAndPack:
4941
    --zExp;
4942
    return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
4943

    
4944
}
4945

    
4946
/*----------------------------------------------------------------------------
4947
| Returns the result of adding the quadruple-precision floating-point values
4948
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4949
| for Binary Floating-Point Arithmetic.
4950
*----------------------------------------------------------------------------*/
4951

    
4952
float128 float128_add( float128 a, float128 b STATUS_PARAM )
4953
{
4954
    flag aSign, bSign;
4955

    
4956
    aSign = extractFloat128Sign( a );
4957
    bSign = extractFloat128Sign( b );
4958
    if ( aSign == bSign ) {
4959
        return addFloat128Sigs( a, b, aSign STATUS_VAR );
4960
    }
4961
    else {
4962
        return subFloat128Sigs( a, b, aSign STATUS_VAR );
4963
    }
4964

    
4965
}
4966

    
4967
/*----------------------------------------------------------------------------
4968
| Returns the result of subtracting the quadruple-precision floating-point
4969
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4970
| Standard for Binary Floating-Point Arithmetic.
4971
*----------------------------------------------------------------------------*/
4972

    
4973
float128 float128_sub( float128 a, float128 b STATUS_PARAM )
4974
{
4975
    flag aSign, bSign;
4976

    
4977
    aSign = extractFloat128Sign( a );
4978
    bSign = extractFloat128Sign( b );
4979
    if ( aSign == bSign ) {
4980
        return subFloat128Sigs( a, b, aSign STATUS_VAR );
4981
    }
4982
    else {
4983
        return addFloat128Sigs( a, b, aSign STATUS_VAR );
4984
    }
4985

    
4986
}
4987

    
4988
/*----------------------------------------------------------------------------
4989
| Returns the result of multiplying the quadruple-precision floating-point
4990
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4991
| Standard for Binary Floating-Point Arithmetic.
4992
*----------------------------------------------------------------------------*/
4993

    
4994
float128 float128_mul( float128 a, float128 b STATUS_PARAM )
4995
{
4996
    flag aSign, bSign, zSign;
4997
    int32 aExp, bExp, zExp;
4998
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4999
    float128 z;
5000

    
5001
    aSig1 = extractFloat128Frac1( a );
5002
    aSig0 = extractFloat128Frac0( a );
5003
    aExp = extractFloat128Exp( a );
5004
    aSign = extractFloat128Sign( a );
5005
    bSig1 = extractFloat128Frac1( b );
5006
    bSig0 = extractFloat128Frac0( b );
5007
    bExp = extractFloat128Exp( b );
5008
    bSign = extractFloat128Sign( b );
5009
    zSign = aSign ^ bSign;
5010
    if ( aExp == 0x7FFF ) {
5011
        if (    ( aSig0 | aSig1 )
5012
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5013
            return propagateFloat128NaN( a, b STATUS_VAR );
5014
        }
5015
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5016
        return packFloat128( zSign, 0x7FFF, 0, 0 );
5017
    }
5018
    if ( bExp == 0x7FFF ) {
5019
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5020
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5021
 invalid:
5022
            float_raise( float_flag_invalid STATUS_VAR);
5023
            z.low = float128_default_nan_low;
5024
            z.high = float128_default_nan_high;
5025
            return z;
5026
        }
5027
        return packFloat128( zSign, 0x7FFF, 0, 0 );
5028
    }
5029
    if ( aExp == 0 ) {
5030
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5031
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5032
    }
5033
    if ( bExp == 0 ) {
5034
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5035
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5036
    }
5037
    zExp = aExp + bExp - 0x4000;
5038
    aSig0 |= LIT64( 0x0001000000000000 );
5039
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5040
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5041
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5042
    zSig2 |= ( zSig3 != 0 );
5043
    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5044
        shift128ExtraRightJamming(
5045
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5046
        ++zExp;
5047
    }
5048
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5049

    
5050
}
5051

    
5052
/*----------------------------------------------------------------------------
5053
| Returns the result of dividing the quadruple-precision floating-point value
5054
| `a' by the corresponding value `b'.  The operation is performed according to
5055
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5056
*----------------------------------------------------------------------------*/
5057

    
5058
float128 float128_div( float128 a, float128 b STATUS_PARAM )
5059
{
5060
    flag aSign, bSign, zSign;
5061
    int32 aExp, bExp, zExp;
5062
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5063
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5064
    float128 z;
5065

    
5066
    aSig1 = extractFloat128Frac1( a );
5067
    aSig0 = extractFloat128Frac0( a );
5068
    aExp = extractFloat128Exp( a );
5069
    aSign = extractFloat128Sign( a );
5070
    bSig1 = extractFloat128Frac1( b );
5071
    bSig0 = extractFloat128Frac0( b );
5072
    bExp = extractFloat128Exp( b );
5073
    bSign = extractFloat128Sign( b );
5074
    zSign = aSign ^ bSign;
5075
    if ( aExp == 0x7FFF ) {
5076
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5077
        if ( bExp == 0x7FFF ) {
5078
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5079
            goto invalid;
5080
        }
5081
        return packFloat128( zSign, 0x7FFF, 0, 0 );
5082
    }
5083
    if ( bExp == 0x7FFF ) {
5084
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5085
        return packFloat128( zSign, 0, 0, 0 );
5086
    }
5087
    if ( bExp == 0 ) {
5088
        if ( ( bSig0 | bSig1 ) == 0 ) {
5089
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5090
 invalid:
5091
                float_raise( float_flag_invalid STATUS_VAR);
5092
                z.low = float128_default_nan_low;
5093
                z.high = float128_default_nan_high;
5094
                return z;
5095
            }
5096
            float_raise( float_flag_divbyzero STATUS_VAR);
5097
            return packFloat128( zSign, 0x7FFF, 0, 0 );
5098
        }
5099
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5100
    }
5101
    if ( aExp == 0 ) {
5102
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5103
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5104
    }
5105
    zExp = aExp - bExp + 0x3FFD;
5106
    shortShift128Left(
5107
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5108
    shortShift128Left(
5109
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5110
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5111
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5112
        ++zExp;
5113
    }
5114
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5115
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5116
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5117
    while ( (sbits64) rem0 < 0 ) {
5118
        --zSig0;
5119
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5120
    }
5121
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5122
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5123
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5124
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5125
        while ( (sbits64) rem1 < 0 ) {
5126
            --zSig1;
5127
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5128
        }
5129
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5130
    }
5131
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5132
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5133

    
5134
}
5135

    
5136
/*----------------------------------------------------------------------------
5137
| Returns the remainder of the quadruple-precision floating-point value `a'
5138
| with respect to the corresponding value `b'.  The operation is performed
5139
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5140
*----------------------------------------------------------------------------*/
5141

    
5142
float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5143
{
5144
    flag aSign, zSign;
5145
    int32 aExp, bExp, expDiff;
5146
    bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5147
    bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5148
    sbits64 sigMean0;
5149
    float128 z;
5150

    
5151
    aSig1 = extractFloat128Frac1( a );
5152
    aSig0 = extractFloat128Frac0( a );
5153
    aExp = extractFloat128Exp( a );
5154
    aSign = extractFloat128Sign( a );
5155
    bSig1 = extractFloat128Frac1( b );
5156
    bSig0 = extractFloat128Frac0( b );
5157
    bExp = extractFloat128Exp( b );
5158
    if ( aExp == 0x7FFF ) {
5159
        if (    ( aSig0 | aSig1 )
5160
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5161
            return propagateFloat128NaN( a, b STATUS_VAR );
5162
        }
5163
        goto invalid;
5164
    }
5165
    if ( bExp == 0x7FFF ) {
5166
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5167
        return a;
5168
    }
5169
    if ( bExp == 0 ) {
5170
        if ( ( bSig0 | bSig1 ) == 0 ) {
5171
 invalid:
5172
            float_raise( float_flag_invalid STATUS_VAR);
5173
            z.low = float128_default_nan_low;
5174
            z.high = float128_default_nan_high;
5175
            return z;
5176
        }
5177
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5178
    }
5179
    if ( aExp == 0 ) {
5180
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
5181
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5182
    }
5183
    expDiff = aExp - bExp;
5184
    if ( expDiff < -1 ) return a;
5185
    shortShift128Left(
5186
        aSig0 | LIT64( 0x0001000000000000 ),
5187
        aSig1,
5188
        15 - ( expDiff < 0 ),
5189
        &aSig0,
5190
        &aSig1
5191
    );
5192
    shortShift128Left(
5193
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5194
    q = le128( bSig0, bSig1, aSig0, aSig1 );
5195
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5196
    expDiff -= 64;
5197
    while ( 0 < expDiff ) {
5198
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5199
        q = ( 4 < q ) ? q - 4 : 0;
5200
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5201
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5202
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5203
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5204
        expDiff -= 61;
5205
    }
5206
    if ( -64 < expDiff ) {
5207
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5208
        q = ( 4 < q ) ? q - 4 : 0;
5209
        q >>= - expDiff;
5210
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5211
        expDiff += 52;
5212
        if ( expDiff < 0 ) {
5213
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5214
        }
5215
        else {
5216
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5217
        }
5218
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5219
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5220
    }
5221
    else {
5222
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5223
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5224
    }
5225
    do {
5226
        alternateASig0 = aSig0;
5227
        alternateASig1 = aSig1;
5228
        ++q;
5229
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5230
    } while ( 0 <= (sbits64) aSig0 );
5231
    add128(
5232
        aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5233
    if (    ( sigMean0 < 0 )
5234
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5235
        aSig0 = alternateASig0;
5236
        aSig1 = alternateASig1;
5237
    }
5238
    zSign = ( (sbits64) aSig0 < 0 );
5239
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5240
    return
5241
        normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5242

    
5243
}
5244

    
5245
/*----------------------------------------------------------------------------
5246
| Returns the square root of the quadruple-precision floating-point value `a'.
5247
| The operation is performed according to the IEC/IEEE Standard for Binary
5248
| Floating-Point Arithmetic.
5249
*----------------------------------------------------------------------------*/
5250

    
5251
float128 float128_sqrt( float128 a STATUS_PARAM )
5252
{
5253
    flag aSign;
5254
    int32 aExp, zExp;
5255
    bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5256
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5257
    float128 z;
5258

    
5259
    aSig1 = extractFloat128Frac1( a );
5260
    aSig0 = extractFloat128Frac0( a );
5261
    aExp = extractFloat128Exp( a );
5262
    aSign = extractFloat128Sign( a );
5263
    if ( aExp == 0x7FFF ) {
5264
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5265
        if ( ! aSign ) return a;
5266
        goto invalid;
5267
    }
5268
    if ( aSign ) {
5269
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5270
 invalid:
5271
        float_raise( float_flag_invalid STATUS_VAR);
5272
        z.low = float128_default_nan_low;
5273
        z.high = float128_default_nan_high;
5274
        return z;
5275
    }
5276
    if ( aExp == 0 ) {
5277
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5278
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5279
    }
5280
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5281
    aSig0 |= LIT64( 0x0001000000000000 );
5282
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5283
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5284
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5285
    doubleZSig0 = zSig0<<1;
5286
    mul64To128( zSig0, zSig0, &term0, &term1 );
5287
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5288
    while ( (sbits64) rem0 < 0 ) {
5289
        --zSig0;
5290
        doubleZSig0 -= 2;
5291
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5292
    }
5293
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5294
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5295
        if ( zSig1 == 0 ) zSig1 = 1;
5296
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5297
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5298
        mul64To128( zSig1, zSig1, &term2, &term3 );
5299
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5300
        while ( (sbits64) rem1 < 0 ) {
5301
            --zSig1;
5302
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5303
            term3 |= 1;
5304
            term2 |= doubleZSig0;
5305
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5306
        }
5307
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5308
    }
5309
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5310
    return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5311

    
5312
}
5313

    
5314
/*----------------------------------------------------------------------------
5315
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5316
| the corresponding value `b', and 0 otherwise.  The comparison is performed
5317
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5318
*----------------------------------------------------------------------------*/
5319

    
5320
int float128_eq( float128 a, float128 b STATUS_PARAM )
5321
{
5322

    
5323
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5324
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5325
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5326
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5327
       ) {
5328
        if (    float128_is_signaling_nan( a )
5329
             || float128_is_signaling_nan( b ) ) {
5330
            float_raise( float_flag_invalid STATUS_VAR);
5331
        }
5332
        return 0;
5333
    }
5334
    return
5335
           ( a.low == b.low )
5336
        && (    ( a.high == b.high )
5337
             || (    ( a.low == 0 )
5338
                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5339
           );
5340

    
5341
}
5342

    
5343
/*----------------------------------------------------------------------------
5344
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5345
| or equal to the corresponding value `b', and 0 otherwise.  The comparison
5346
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5347
| Arithmetic.
5348
*----------------------------------------------------------------------------*/
5349

    
5350
int float128_le( float128 a, float128 b STATUS_PARAM )
5351
{
5352
    flag aSign, bSign;
5353

    
5354
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5355
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5356
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5357
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5358
       ) {
5359
        float_raise( float_flag_invalid STATUS_VAR);
5360
        return 0;
5361
    }
5362
    aSign = extractFloat128Sign( a );
5363
    bSign = extractFloat128Sign( b );
5364
    if ( aSign != bSign ) {
5365
        return
5366
               aSign
5367
            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5368
                 == 0 );
5369
    }
5370
    return
5371
          aSign ? le128( b.high, b.low, a.high, a.low )
5372
        : le128( a.high, a.low, b.high, b.low );
5373

    
5374
}
5375

    
5376
/*----------------------------------------------------------------------------
5377
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5378
| the corresponding value `b', and 0 otherwise.  The comparison is performed
5379
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5380
*----------------------------------------------------------------------------*/
5381

    
5382
int float128_lt( float128 a, float128 b STATUS_PARAM )
5383
{
5384
    flag aSign, bSign;
5385

    
5386
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5387
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5388
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5389
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5390
       ) {
5391
        float_raise( float_flag_invalid STATUS_VAR);
5392
        return 0;
5393
    }
5394
    aSign = extractFloat128Sign( a );
5395
    bSign = extractFloat128Sign( b );
5396
    if ( aSign != bSign ) {
5397
        return
5398
               aSign
5399
            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5400
                 != 0 );
5401
    }
5402
    return
5403
          aSign ? lt128( b.high, b.low, a.high, a.low )
5404
        : lt128( a.high, a.low, b.high, b.low );
5405

    
5406
}
5407

    
5408
/*----------------------------------------------------------------------------
5409
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5410
| the corresponding value `b', and 0 otherwise.  The invalid exception is
5411
| raised if either operand is a NaN.  Otherwise, the comparison is performed
5412
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5413
*----------------------------------------------------------------------------*/
5414

    
5415
int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5416
{
5417

    
5418
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5419
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5420
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5421
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5422
       ) {
5423
        float_raise( float_flag_invalid STATUS_VAR);
5424
        return 0;
5425
    }
5426
    return
5427
           ( a.low == b.low )
5428
        && (    ( a.high == b.high )
5429
             || (    ( a.low == 0 )
5430
                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5431
           );
5432

    
5433
}
5434

    
5435
/*----------------------------------------------------------------------------
5436
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5437
| or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5438
| cause an exception.  Otherwise, the comparison is performed according to the
5439
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5440
*----------------------------------------------------------------------------*/
5441

    
5442
int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5443
{
5444
    flag aSign, bSign;
5445

    
5446
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5447
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5448
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5449
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5450
       ) {
5451
        if (    float128_is_signaling_nan( a )
5452
             || float128_is_signaling_nan( b ) ) {
5453
            float_raise( float_flag_invalid STATUS_VAR);
5454
        }
5455
        return 0;
5456
    }
5457
    aSign = extractFloat128Sign( a );
5458
    bSign = extractFloat128Sign( b );
5459
    if ( aSign != bSign ) {
5460
        return
5461
               aSign
5462
            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5463
                 == 0 );
5464
    }
5465
    return
5466
          aSign ? le128( b.high, b.low, a.high, a.low )
5467
        : le128( a.high, a.low, b.high, b.low );
5468

    
5469
}
5470

    
5471
/*----------------------------------------------------------------------------
5472
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5473
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5474
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5475
| Standard for Binary Floating-Point Arithmetic.
5476
*----------------------------------------------------------------------------*/
5477

    
5478
int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5479
{
5480
    flag aSign, bSign;
5481

    
5482
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5483
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5484
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5485
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5486
       ) {
5487
        if (    float128_is_signaling_nan( a )
5488
             || float128_is_signaling_nan( b ) ) {
5489
            float_raise( float_flag_invalid STATUS_VAR);
5490
        }
5491
        return 0;
5492
    }
5493
    aSign = extractFloat128Sign( a );
5494
    bSign = extractFloat128Sign( b );
5495
    if ( aSign != bSign ) {
5496
        return
5497
               aSign
5498
            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5499
                 != 0 );
5500
    }
5501
    return
5502
          aSign ? lt128( b.high, b.low, a.high, a.low )
5503
        : lt128( a.high, a.low, b.high, b.low );
5504

    
5505
}
5506

    
5507
#endif
5508

    
5509
/* misc functions */
5510
float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5511
{
5512
    return int64_to_float32(a STATUS_VAR);
5513
}
5514

    
5515
float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5516
{
5517
    return int64_to_float64(a STATUS_VAR);
5518
}
5519

    
5520
unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5521
{
5522
    int64_t v;
5523
    unsigned int res;
5524

    
5525
    v = float32_to_int64(a STATUS_VAR);
5526
    if (v < 0) {
5527
        res = 0;
5528
        float_raise( float_flag_invalid STATUS_VAR);
5529
    } else if (v > 0xffffffff) {
5530
        res = 0xffffffff;
5531
        float_raise( float_flag_invalid STATUS_VAR);
5532
    } else {
5533
        res = v;
5534
    }
5535
    return res;
5536
}
5537

    
5538
unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5539
{
5540
    int64_t v;
5541
    unsigned int res;
5542

    
5543
    v = float32_to_int64_round_to_zero(a STATUS_VAR);
5544
    if (v < 0) {
5545
        res = 0;
5546
        float_raise( float_flag_invalid STATUS_VAR);
5547
    } else if (v > 0xffffffff) {
5548
        res = 0xffffffff;
5549
        float_raise( float_flag_invalid STATUS_VAR);
5550
    } else {
5551
        res = v;
5552
    }
5553
    return res;
5554
}
5555

    
5556
unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5557
{
5558
    int64_t v;
5559
    unsigned int res;
5560

    
5561
    v = float64_to_int64(a STATUS_VAR);
5562
    if (v < 0) {
5563
        res = 0;
5564
        float_raise( float_flag_invalid STATUS_VAR);
5565
    } else if (v > 0xffffffff) {
5566
        res = 0xffffffff;
5567
        float_raise( float_flag_invalid STATUS_VAR);
5568
    } else {
5569
        res = v;
5570
    }
5571
    return res;
5572
}
5573

    
5574
unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5575
{
5576
    int64_t v;
5577
    unsigned int res;
5578

    
5579
    v = float64_to_int64_round_to_zero(a STATUS_VAR);
5580
    if (v < 0) {
5581
        res = 0;
5582
        float_raise( float_flag_invalid STATUS_VAR);
5583
    } else if (v > 0xffffffff) {
5584
        res = 0xffffffff;
5585
        float_raise( float_flag_invalid STATUS_VAR);
5586
    } else {
5587
        res = v;
5588
    }
5589
    return res;
5590
}
5591

    
5592
/* FIXME: This looks broken.  */
5593
uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5594
{
5595
    int64_t v;
5596

    
5597
    v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5598
    v += float64_val(a);
5599
    v = float64_to_int64(make_float64(v) STATUS_VAR);
5600

    
5601
    return v - INT64_MIN;
5602
}
5603

    
5604
uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5605
{
5606
    int64_t v;
5607

    
5608
    v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5609
    v += float64_val(a);
5610
    v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5611

    
5612
    return v - INT64_MIN;
5613
}
5614

    
5615
#define COMPARE(s, nan_exp)                                                  \
5616
INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
5617
                                      int is_quiet STATUS_PARAM )            \
5618
{                                                                            \
5619
    flag aSign, bSign;                                                       \
5620
    bits ## s av, bv;                                                        \
5621
                                                                             \
5622
    if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
5623
         extractFloat ## s ## Frac( a ) ) ||                                 \
5624
        ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
5625
          extractFloat ## s ## Frac( b ) )) {                                \
5626
        if (!is_quiet ||                                                     \
5627
            float ## s ## _is_signaling_nan( a ) ||                          \
5628
            float ## s ## _is_signaling_nan( b ) ) {                         \
5629
            float_raise( float_flag_invalid STATUS_VAR);                     \
5630
        }                                                                    \
5631
        return float_relation_unordered;                                     \
5632
    }                                                                        \
5633
    aSign = extractFloat ## s ## Sign( a );                                  \
5634
    bSign = extractFloat ## s ## Sign( b );                                  \
5635
    av = float ## s ## _val(a);                                              \
5636
    bv = float ## s ## _val(b);                                              \
5637
    if ( aSign != bSign ) {                                                  \
5638
        if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) {                         \
5639
            /* zero case */                                                  \
5640
            return float_relation_equal;                                     \
5641
        } else {                                                             \
5642
            return 1 - (2 * aSign);                                          \
5643
        }                                                                    \
5644
    } else {                                                                 \
5645
        if (av == bv) {                                                      \
5646
            return float_relation_equal;                                     \
5647
        } else {                                                             \
5648
            return 1 - 2 * (aSign ^ ( av < bv ));                            \
5649
        }                                                                    \
5650
    }                                                                        \
5651
}                                                                            \
5652
                                                                             \
5653
int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
5654
{                                                                            \
5655
    return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
5656
}                                                                            \
5657
                                                                             \
5658
int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
5659
{                                                                            \
5660
    return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
5661
}
5662

    
5663
COMPARE(32, 0xff)
5664
COMPARE(64, 0x7ff)
5665

    
5666
INLINE int float128_compare_internal( float128 a, float128 b,
5667
                                      int is_quiet STATUS_PARAM )
5668
{
5669
    flag aSign, bSign;
5670

    
5671
    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
5672
          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
5673
        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
5674
          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
5675
        if (!is_quiet ||
5676
            float128_is_signaling_nan( a ) ||
5677
            float128_is_signaling_nan( b ) ) {
5678
            float_raise( float_flag_invalid STATUS_VAR);
5679
        }
5680
        return float_relation_unordered;
5681
    }
5682
    aSign = extractFloat128Sign( a );
5683
    bSign = extractFloat128Sign( b );
5684
    if ( aSign != bSign ) {
5685
        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
5686
            /* zero case */
5687
            return float_relation_equal;
5688
        } else {
5689
            return 1 - (2 * aSign);
5690
        }
5691
    } else {
5692
        if (a.low == b.low && a.high == b.high) {
5693
            return float_relation_equal;
5694
        } else {
5695
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
5696
        }
5697
    }
5698
}
5699

    
5700
int float128_compare( float128 a, float128 b STATUS_PARAM )
5701
{
5702
    return float128_compare_internal(a, b, 0 STATUS_VAR);
5703
}
5704

    
5705
int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
5706
{
5707
    return float128_compare_internal(a, b, 1 STATUS_VAR);
5708
}
5709

    
5710
/* Multiply A by 2 raised to the power N.  */
5711
float32 float32_scalbn( float32 a, int n STATUS_PARAM )
5712
{
5713
    flag aSign;
5714
    int16 aExp;
5715
    bits32 aSig;
5716

    
5717
    aSig = extractFloat32Frac( a );
5718
    aExp = extractFloat32Exp( a );
5719
    aSign = extractFloat32Sign( a );
5720

    
5721
    if ( aExp == 0xFF ) {
5722
        return a;
5723
    }
5724
    if ( aExp != 0 )
5725
        aSig |= 0x00800000;
5726
    else if ( aSig == 0 )
5727
        return a;
5728

    
5729
    aExp += n - 1;
5730
    aSig <<= 7;
5731
    return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
5732
}
5733

    
5734
float64 float64_scalbn( float64 a, int n STATUS_PARAM )
5735
{
5736
    flag aSign;
5737
    int16 aExp;
5738
    bits64 aSig;
5739

    
5740
    aSig = extractFloat64Frac( a );
5741
    aExp = extractFloat64Exp( a );
5742
    aSign = extractFloat64Sign( a );
5743

    
5744
    if ( aExp == 0x7FF ) {
5745
        return a;
5746
    }
5747
    if ( aExp != 0 )
5748
        aSig |= LIT64( 0x0010000000000000 );
5749
    else if ( aSig == 0 )
5750
        return a;
5751

    
5752
    aExp += n - 1;
5753
    aSig <<= 10;
5754
    return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
5755
}
5756

    
5757
#ifdef FLOATX80
5758
floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
5759
{
5760
    flag aSign;
5761
    int16 aExp;
5762
    bits64 aSig;
5763

    
5764
    aSig = extractFloatx80Frac( a );
5765
    aExp = extractFloatx80Exp( a );
5766
    aSign = extractFloatx80Sign( a );
5767

    
5768
    if ( aExp == 0x7FF ) {
5769
        return a;
5770
    }
5771
    if (aExp == 0 && aSig == 0)
5772
        return a;
5773

    
5774
    aExp += n;
5775
    return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
5776
                                          aSign, aExp, aSig, 0 STATUS_VAR );
5777
}
5778
#endif
5779

    
5780
#ifdef FLOAT128
5781
float128 float128_scalbn( float128 a, int n STATUS_PARAM )
5782
{
5783
    flag aSign;
5784
    int32 aExp;
5785
    bits64 aSig0, aSig1;
5786

    
5787
    aSig1 = extractFloat128Frac1( a );
5788
    aSig0 = extractFloat128Frac0( a );
5789
    aExp = extractFloat128Exp( a );
5790
    aSign = extractFloat128Sign( a );
5791
    if ( aExp == 0x7FFF ) {
5792
        return a;
5793
    }
5794
    if ( aExp != 0 )
5795
        aSig0 |= LIT64( 0x0001000000000000 );
5796
    else if ( aSig0 == 0 && aSig1 == 0 )
5797
        return a;
5798

    
5799
    aExp += n - 1;
5800
    return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
5801
                                          STATUS_VAR );
5802

    
5803
}
5804
#endif