Statistics
| Branch: | Revision:

root / fpu / softfloat.c @ b53d44e5

History | View | Annotate | Download (198.1 kB)

1

    
2
/*============================================================================
3

4
This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5
Package, Release 2b.
6

7
Written by John R. Hauser.  This work was made possible in part by the
8
International Computer Science Institute, located at Suite 600, 1947 Center
9
Street, Berkeley, California 94704.  Funding was partially provided by the
10
National Science Foundation under grant MIP-9311980.  The original version
11
of this code was written as part of a project to build a fixed-point vector
12
processor in collaboration with the University of California at Berkeley,
13
overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
14
is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15
arithmetic/SoftFloat.html'.
16

17
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
18
been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19
RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20
AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21
COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22
EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23
INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24
OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
25

26
Derivative works are acceptable, even for commercial purposes, so long as
27
(1) the source code for the derivative work includes prominent notice that
28
the work is derivative, and (2) the source code includes prominent notice with
29
these four paragraphs for those parts of this code that are retained.
30

31
=============================================================================*/
32

    
33
/* FIXME: Flush-To-Zero only effects results.  Denormal inputs should also
34
   be flushed to zero.  */
35
#include "softfloat.h"
36

    
37
/*----------------------------------------------------------------------------
38
| Primitive arithmetic functions, including multi-word arithmetic, and
39
| division and square root approximations.  (Can be specialized to target if
40
| desired.)
41
*----------------------------------------------------------------------------*/
42
#include "softfloat-macros.h"
43

    
44
/*----------------------------------------------------------------------------
45
| Functions and definitions to determine:  (1) whether tininess for underflow
46
| is detected before or after rounding by default, (2) what (if anything)
47
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
48
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
49
| are propagated from function inputs to output.  These details are target-
50
| specific.
51
*----------------------------------------------------------------------------*/
52
#include "softfloat-specialize.h"
53

    
54
void set_float_rounding_mode(int val STATUS_PARAM)
55
{
56
    STATUS(float_rounding_mode) = val;
57
}
58

    
59
void set_float_exception_flags(int val STATUS_PARAM)
60
{
61
    STATUS(float_exception_flags) = val;
62
}
63

    
64
#ifdef FLOATX80
65
void set_floatx80_rounding_precision(int val STATUS_PARAM)
66
{
67
    STATUS(floatx80_rounding_precision) = val;
68
}
69
#endif
70

    
71
/*----------------------------------------------------------------------------
72
| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
73
| and 7, and returns the properly rounded 32-bit integer corresponding to the
74
| input.  If `zSign' is 1, the input is negated before being converted to an
75
| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
76
| is simply rounded to an integer, with the inexact exception raised if the
77
| input cannot be represented exactly as an integer.  However, if the fixed-
78
| point input is too large, the invalid exception is raised and the largest
79
| positive or negative integer is returned.
80
*----------------------------------------------------------------------------*/
81

    
82
static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
83
{
84
    int8 roundingMode;
85
    flag roundNearestEven;
86
    int8 roundIncrement, roundBits;
87
    int32 z;
88

    
89
    roundingMode = STATUS(float_rounding_mode);
90
    roundNearestEven = ( roundingMode == float_round_nearest_even );
91
    roundIncrement = 0x40;
92
    if ( ! roundNearestEven ) {
93
        if ( roundingMode == float_round_to_zero ) {
94
            roundIncrement = 0;
95
        }
96
        else {
97
            roundIncrement = 0x7F;
98
            if ( zSign ) {
99
                if ( roundingMode == float_round_up ) roundIncrement = 0;
100
            }
101
            else {
102
                if ( roundingMode == float_round_down ) roundIncrement = 0;
103
            }
104
        }
105
    }
106
    roundBits = absZ & 0x7F;
107
    absZ = ( absZ + roundIncrement )>>7;
108
    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
109
    z = absZ;
110
    if ( zSign ) z = - z;
111
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
112
        float_raise( float_flag_invalid STATUS_VAR);
113
        return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
114
    }
115
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
116
    return z;
117

    
118
}
119

    
120
/*----------------------------------------------------------------------------
121
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
122
| `absZ1', with binary point between bits 63 and 64 (between the input words),
123
| and returns the properly rounded 64-bit integer corresponding to the input.
124
| If `zSign' is 1, the input is negated before being converted to an integer.
125
| Ordinarily, the fixed-point input is simply rounded to an integer, with
126
| the inexact exception raised if the input cannot be represented exactly as
127
| an integer.  However, if the fixed-point input is too large, the invalid
128
| exception is raised and the largest positive or negative integer is
129
| returned.
130
*----------------------------------------------------------------------------*/
131

    
132
static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
133
{
134
    int8 roundingMode;
135
    flag roundNearestEven, increment;
136
    int64 z;
137

    
138
    roundingMode = STATUS(float_rounding_mode);
139
    roundNearestEven = ( roundingMode == float_round_nearest_even );
140
    increment = ( (sbits64) absZ1 < 0 );
141
    if ( ! roundNearestEven ) {
142
        if ( roundingMode == float_round_to_zero ) {
143
            increment = 0;
144
        }
145
        else {
146
            if ( zSign ) {
147
                increment = ( roundingMode == float_round_down ) && absZ1;
148
            }
149
            else {
150
                increment = ( roundingMode == float_round_up ) && absZ1;
151
            }
152
        }
153
    }
154
    if ( increment ) {
155
        ++absZ0;
156
        if ( absZ0 == 0 ) goto overflow;
157
        absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
158
    }
159
    z = absZ0;
160
    if ( zSign ) z = - z;
161
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
162
 overflow:
163
        float_raise( float_flag_invalid STATUS_VAR);
164
        return
165
              zSign ? (sbits64) LIT64( 0x8000000000000000 )
166
            : LIT64( 0x7FFFFFFFFFFFFFFF );
167
    }
168
    if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
169
    return z;
170

    
171
}
172

    
173
/*----------------------------------------------------------------------------
174
| Returns the fraction bits of the single-precision floating-point value `a'.
175
*----------------------------------------------------------------------------*/
176

    
177
INLINE bits32 extractFloat32Frac( float32 a )
178
{
179

    
180
    return float32_val(a) & 0x007FFFFF;
181

    
182
}
183

    
184
/*----------------------------------------------------------------------------
185
| Returns the exponent bits of the single-precision floating-point value `a'.
186
*----------------------------------------------------------------------------*/
187

    
188
INLINE int16 extractFloat32Exp( float32 a )
189
{
190

    
191
    return ( float32_val(a)>>23 ) & 0xFF;
192

    
193
}
194

    
195
/*----------------------------------------------------------------------------
196
| Returns the sign bit of the single-precision floating-point value `a'.
197
*----------------------------------------------------------------------------*/
198

    
199
INLINE flag extractFloat32Sign( float32 a )
200
{
201

    
202
    return float32_val(a)>>31;
203

    
204
}
205

    
206
/*----------------------------------------------------------------------------
207
| Normalizes the subnormal single-precision floating-point value represented
208
| by the denormalized significand `aSig'.  The normalized exponent and
209
| significand are stored at the locations pointed to by `zExpPtr' and
210
| `zSigPtr', respectively.
211
*----------------------------------------------------------------------------*/
212

    
213
static void
214
 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
215
{
216
    int8 shiftCount;
217

    
218
    shiftCount = countLeadingZeros32( aSig ) - 8;
219
    *zSigPtr = aSig<<shiftCount;
220
    *zExpPtr = 1 - shiftCount;
221

    
222
}
223

    
224
/*----------------------------------------------------------------------------
225
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
226
| single-precision floating-point value, returning the result.  After being
227
| shifted into the proper positions, the three fields are simply added
228
| together to form the result.  This means that any integer portion of `zSig'
229
| will be added into the exponent.  Since a properly normalized significand
230
| will have an integer portion equal to 1, the `zExp' input should be 1 less
231
| than the desired result exponent whenever `zSig' is a complete, normalized
232
| significand.
233
*----------------------------------------------------------------------------*/
234

    
235
INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
236
{
237

    
238
    return make_float32(
239
          ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
240

    
241
}
242

    
243
/*----------------------------------------------------------------------------
244
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
245
| and significand `zSig', and returns the proper single-precision floating-
246
| point value corresponding to the abstract input.  Ordinarily, the abstract
247
| value is simply rounded and packed into the single-precision format, with
248
| the inexact exception raised if the abstract input cannot be represented
249
| exactly.  However, if the abstract value is too large, the overflow and
250
| inexact exceptions are raised and an infinity or maximal finite value is
251
| returned.  If the abstract value is too small, the input value is rounded to
252
| a subnormal number, and the underflow and inexact exceptions are raised if
253
| the abstract input cannot be represented exactly as a subnormal single-
254
| precision floating-point number.
255
|     The input significand `zSig' has its binary point between bits 30
256
| and 29, which is 7 bits to the left of the usual location.  This shifted
257
| significand must be normalized or smaller.  If `zSig' is not normalized,
258
| `zExp' must be 0; in that case, the result returned is a subnormal number,
259
| and it must not require rounding.  In the usual case that `zSig' is
260
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
261
| The handling of underflow and overflow follows the IEC/IEEE Standard for
262
| Binary Floating-Point Arithmetic.
263
*----------------------------------------------------------------------------*/
264

    
265
static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
266
{
267
    int8 roundingMode;
268
    flag roundNearestEven;
269
    int8 roundIncrement, roundBits;
270
    flag isTiny;
271

    
272
    roundingMode = STATUS(float_rounding_mode);
273
    roundNearestEven = ( roundingMode == float_round_nearest_even );
274
    roundIncrement = 0x40;
275
    if ( ! roundNearestEven ) {
276
        if ( roundingMode == float_round_to_zero ) {
277
            roundIncrement = 0;
278
        }
279
        else {
280
            roundIncrement = 0x7F;
281
            if ( zSign ) {
282
                if ( roundingMode == float_round_up ) roundIncrement = 0;
283
            }
284
            else {
285
                if ( roundingMode == float_round_down ) roundIncrement = 0;
286
            }
287
        }
288
    }
289
    roundBits = zSig & 0x7F;
290
    if ( 0xFD <= (bits16) zExp ) {
291
        if (    ( 0xFD < zExp )
292
             || (    ( zExp == 0xFD )
293
                  && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
294
           ) {
295
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
296
            return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
297
        }
298
        if ( zExp < 0 ) {
299
            if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
300
            isTiny =
301
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
302
                || ( zExp < -1 )
303
                || ( zSig + roundIncrement < 0x80000000 );
304
            shift32RightJamming( zSig, - zExp, &zSig );
305
            zExp = 0;
306
            roundBits = zSig & 0x7F;
307
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
308
        }
309
    }
310
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
311
    zSig = ( zSig + roundIncrement )>>7;
312
    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
313
    if ( zSig == 0 ) zExp = 0;
314
    return packFloat32( zSign, zExp, zSig );
315

    
316
}
317

    
318
/*----------------------------------------------------------------------------
319
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
320
| and significand `zSig', and returns the proper single-precision floating-
321
| point value corresponding to the abstract input.  This routine is just like
322
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
323
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
324
| floating-point exponent.
325
*----------------------------------------------------------------------------*/
326

    
327
static float32
328
 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
329
{
330
    int8 shiftCount;
331

    
332
    shiftCount = countLeadingZeros32( zSig ) - 1;
333
    return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
334

    
335
}
336

    
337
/*----------------------------------------------------------------------------
338
| Returns the fraction bits of the double-precision floating-point value `a'.
339
*----------------------------------------------------------------------------*/
340

    
341
INLINE bits64 extractFloat64Frac( float64 a )
342
{
343

    
344
    return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
345

    
346
}
347

    
348
/*----------------------------------------------------------------------------
349
| Returns the exponent bits of the double-precision floating-point value `a'.
350
*----------------------------------------------------------------------------*/
351

    
352
INLINE int16 extractFloat64Exp( float64 a )
353
{
354

    
355
    return ( float64_val(a)>>52 ) & 0x7FF;
356

    
357
}
358

    
359
/*----------------------------------------------------------------------------
360
| Returns the sign bit of the double-precision floating-point value `a'.
361
*----------------------------------------------------------------------------*/
362

    
363
INLINE flag extractFloat64Sign( float64 a )
364
{
365

    
366
    return float64_val(a)>>63;
367

    
368
}
369

    
370
/*----------------------------------------------------------------------------
371
| Normalizes the subnormal double-precision floating-point value represented
372
| by the denormalized significand `aSig'.  The normalized exponent and
373
| significand are stored at the locations pointed to by `zExpPtr' and
374
| `zSigPtr', respectively.
375
*----------------------------------------------------------------------------*/
376

    
377
static void
378
 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
379
{
380
    int8 shiftCount;
381

    
382
    shiftCount = countLeadingZeros64( aSig ) - 11;
383
    *zSigPtr = aSig<<shiftCount;
384
    *zExpPtr = 1 - shiftCount;
385

    
386
}
387

    
388
/*----------------------------------------------------------------------------
389
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
390
| double-precision floating-point value, returning the result.  After being
391
| shifted into the proper positions, the three fields are simply added
392
| together to form the result.  This means that any integer portion of `zSig'
393
| will be added into the exponent.  Since a properly normalized significand
394
| will have an integer portion equal to 1, the `zExp' input should be 1 less
395
| than the desired result exponent whenever `zSig' is a complete, normalized
396
| significand.
397
*----------------------------------------------------------------------------*/
398

    
399
INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
400
{
401

    
402
    return make_float64(
403
        ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
404

    
405
}
406

    
407
/*----------------------------------------------------------------------------
408
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
409
| and significand `zSig', and returns the proper double-precision floating-
410
| point value corresponding to the abstract input.  Ordinarily, the abstract
411
| value is simply rounded and packed into the double-precision format, with
412
| the inexact exception raised if the abstract input cannot be represented
413
| exactly.  However, if the abstract value is too large, the overflow and
414
| inexact exceptions are raised and an infinity or maximal finite value is
415
| returned.  If the abstract value is too small, the input value is rounded
416
| to a subnormal number, and the underflow and inexact exceptions are raised
417
| if the abstract input cannot be represented exactly as a subnormal double-
418
| precision floating-point number.
419
|     The input significand `zSig' has its binary point between bits 62
420
| and 61, which is 10 bits to the left of the usual location.  This shifted
421
| significand must be normalized or smaller.  If `zSig' is not normalized,
422
| `zExp' must be 0; in that case, the result returned is a subnormal number,
423
| and it must not require rounding.  In the usual case that `zSig' is
424
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
425
| The handling of underflow and overflow follows the IEC/IEEE Standard for
426
| Binary Floating-Point Arithmetic.
427
*----------------------------------------------------------------------------*/
428

    
429
static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
430
{
431
    int8 roundingMode;
432
    flag roundNearestEven;
433
    int16 roundIncrement, roundBits;
434
    flag isTiny;
435

    
436
    roundingMode = STATUS(float_rounding_mode);
437
    roundNearestEven = ( roundingMode == float_round_nearest_even );
438
    roundIncrement = 0x200;
439
    if ( ! roundNearestEven ) {
440
        if ( roundingMode == float_round_to_zero ) {
441
            roundIncrement = 0;
442
        }
443
        else {
444
            roundIncrement = 0x3FF;
445
            if ( zSign ) {
446
                if ( roundingMode == float_round_up ) roundIncrement = 0;
447
            }
448
            else {
449
                if ( roundingMode == float_round_down ) roundIncrement = 0;
450
            }
451
        }
452
    }
453
    roundBits = zSig & 0x3FF;
454
    if ( 0x7FD <= (bits16) zExp ) {
455
        if (    ( 0x7FD < zExp )
456
             || (    ( zExp == 0x7FD )
457
                  && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
458
           ) {
459
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
460
            return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
461
        }
462
        if ( zExp < 0 ) {
463
            if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
464
            isTiny =
465
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
466
                || ( zExp < -1 )
467
                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
468
            shift64RightJamming( zSig, - zExp, &zSig );
469
            zExp = 0;
470
            roundBits = zSig & 0x3FF;
471
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
472
        }
473
    }
474
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
475
    zSig = ( zSig + roundIncrement )>>10;
476
    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
477
    if ( zSig == 0 ) zExp = 0;
478
    return packFloat64( zSign, zExp, zSig );
479

    
480
}
481

    
482
/*----------------------------------------------------------------------------
483
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
484
| and significand `zSig', and returns the proper double-precision floating-
485
| point value corresponding to the abstract input.  This routine is just like
486
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
487
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
488
| floating-point exponent.
489
*----------------------------------------------------------------------------*/
490

    
491
static float64
492
 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
493
{
494
    int8 shiftCount;
495

    
496
    shiftCount = countLeadingZeros64( zSig ) - 1;
497
    return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
498

    
499
}
500

    
501
#ifdef FLOATX80
502

    
503
/*----------------------------------------------------------------------------
504
| Returns the fraction bits of the extended double-precision floating-point
505
| value `a'.
506
*----------------------------------------------------------------------------*/
507

    
508
INLINE bits64 extractFloatx80Frac( floatx80 a )
509
{
510

    
511
    return a.low;
512

    
513
}
514

    
515
/*----------------------------------------------------------------------------
516
| Returns the exponent bits of the extended double-precision floating-point
517
| value `a'.
518
*----------------------------------------------------------------------------*/
519

    
520
INLINE int32 extractFloatx80Exp( floatx80 a )
521
{
522

    
523
    return a.high & 0x7FFF;
524

    
525
}
526

    
527
/*----------------------------------------------------------------------------
528
| Returns the sign bit of the extended double-precision floating-point value
529
| `a'.
530
*----------------------------------------------------------------------------*/
531

    
532
INLINE flag extractFloatx80Sign( floatx80 a )
533
{
534

    
535
    return a.high>>15;
536

    
537
}
538

    
539
/*----------------------------------------------------------------------------
540
| Normalizes the subnormal extended double-precision floating-point value
541
| represented by the denormalized significand `aSig'.  The normalized exponent
542
| and significand are stored at the locations pointed to by `zExpPtr' and
543
| `zSigPtr', respectively.
544
*----------------------------------------------------------------------------*/
545

    
546
static void
547
 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
548
{
549
    int8 shiftCount;
550

    
551
    shiftCount = countLeadingZeros64( aSig );
552
    *zSigPtr = aSig<<shiftCount;
553
    *zExpPtr = 1 - shiftCount;
554

    
555
}
556

    
557
/*----------------------------------------------------------------------------
558
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
559
| extended double-precision floating-point value, returning the result.
560
*----------------------------------------------------------------------------*/
561

    
562
INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
563
{
564
    floatx80 z;
565

    
566
    z.low = zSig;
567
    z.high = ( ( (bits16) zSign )<<15 ) + zExp;
568
    return z;
569

    
570
}
571

    
572
/*----------------------------------------------------------------------------
573
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
574
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
575
| and returns the proper extended double-precision floating-point value
576
| corresponding to the abstract input.  Ordinarily, the abstract value is
577
| rounded and packed into the extended double-precision format, with the
578
| inexact exception raised if the abstract input cannot be represented
579
| exactly.  However, if the abstract value is too large, the overflow and
580
| inexact exceptions are raised and an infinity or maximal finite value is
581
| returned.  If the abstract value is too small, the input value is rounded to
582
| a subnormal number, and the underflow and inexact exceptions are raised if
583
| the abstract input cannot be represented exactly as a subnormal extended
584
| double-precision floating-point number.
585
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
586
| number of bits as single or double precision, respectively.  Otherwise, the
587
| result is rounded to the full precision of the extended double-precision
588
| format.
589
|     The input significand must be normalized or smaller.  If the input
590
| significand is not normalized, `zExp' must be 0; in that case, the result
591
| returned is a subnormal number, and it must not require rounding.  The
592
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
593
| Floating-Point Arithmetic.
594
*----------------------------------------------------------------------------*/
595

    
596
static floatx80
597
 roundAndPackFloatx80(
598
     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
599
 STATUS_PARAM)
600
{
601
    int8 roundingMode;
602
    flag roundNearestEven, increment, isTiny;
603
    int64 roundIncrement, roundMask, roundBits;
604

    
605
    roundingMode = STATUS(float_rounding_mode);
606
    roundNearestEven = ( roundingMode == float_round_nearest_even );
607
    if ( roundingPrecision == 80 ) goto precision80;
608
    if ( roundingPrecision == 64 ) {
609
        roundIncrement = LIT64( 0x0000000000000400 );
610
        roundMask = LIT64( 0x00000000000007FF );
611
    }
612
    else if ( roundingPrecision == 32 ) {
613
        roundIncrement = LIT64( 0x0000008000000000 );
614
        roundMask = LIT64( 0x000000FFFFFFFFFF );
615
    }
616
    else {
617
        goto precision80;
618
    }
619
    zSig0 |= ( zSig1 != 0 );
620
    if ( ! roundNearestEven ) {
621
        if ( roundingMode == float_round_to_zero ) {
622
            roundIncrement = 0;
623
        }
624
        else {
625
            roundIncrement = roundMask;
626
            if ( zSign ) {
627
                if ( roundingMode == float_round_up ) roundIncrement = 0;
628
            }
629
            else {
630
                if ( roundingMode == float_round_down ) roundIncrement = 0;
631
            }
632
        }
633
    }
634
    roundBits = zSig0 & roundMask;
635
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
636
        if (    ( 0x7FFE < zExp )
637
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
638
           ) {
639
            goto overflow;
640
        }
641
        if ( zExp <= 0 ) {
642
            if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
643
            isTiny =
644
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
645
                || ( zExp < 0 )
646
                || ( zSig0 <= zSig0 + roundIncrement );
647
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
648
            zExp = 0;
649
            roundBits = zSig0 & roundMask;
650
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
651
            if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
652
            zSig0 += roundIncrement;
653
            if ( (sbits64) zSig0 < 0 ) zExp = 1;
654
            roundIncrement = roundMask + 1;
655
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
656
                roundMask |= roundIncrement;
657
            }
658
            zSig0 &= ~ roundMask;
659
            return packFloatx80( zSign, zExp, zSig0 );
660
        }
661
    }
662
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
663
    zSig0 += roundIncrement;
664
    if ( zSig0 < roundIncrement ) {
665
        ++zExp;
666
        zSig0 = LIT64( 0x8000000000000000 );
667
    }
668
    roundIncrement = roundMask + 1;
669
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
670
        roundMask |= roundIncrement;
671
    }
672
    zSig0 &= ~ roundMask;
673
    if ( zSig0 == 0 ) zExp = 0;
674
    return packFloatx80( zSign, zExp, zSig0 );
675
 precision80:
676
    increment = ( (sbits64) zSig1 < 0 );
677
    if ( ! roundNearestEven ) {
678
        if ( roundingMode == float_round_to_zero ) {
679
            increment = 0;
680
        }
681
        else {
682
            if ( zSign ) {
683
                increment = ( roundingMode == float_round_down ) && zSig1;
684
            }
685
            else {
686
                increment = ( roundingMode == float_round_up ) && zSig1;
687
            }
688
        }
689
    }
690
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
691
        if (    ( 0x7FFE < zExp )
692
             || (    ( zExp == 0x7FFE )
693
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
694
                  && increment
695
                )
696
           ) {
697
            roundMask = 0;
698
 overflow:
699
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
700
            if (    ( roundingMode == float_round_to_zero )
701
                 || ( zSign && ( roundingMode == float_round_up ) )
702
                 || ( ! zSign && ( roundingMode == float_round_down ) )
703
               ) {
704
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
705
            }
706
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
707
        }
708
        if ( zExp <= 0 ) {
709
            isTiny =
710
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
711
                || ( zExp < 0 )
712
                || ! increment
713
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
714
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
715
            zExp = 0;
716
            if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
717
            if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
718
            if ( roundNearestEven ) {
719
                increment = ( (sbits64) zSig1 < 0 );
720
            }
721
            else {
722
                if ( zSign ) {
723
                    increment = ( roundingMode == float_round_down ) && zSig1;
724
                }
725
                else {
726
                    increment = ( roundingMode == float_round_up ) && zSig1;
727
                }
728
            }
729
            if ( increment ) {
730
                ++zSig0;
731
                zSig0 &=
732
                    ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
733
                if ( (sbits64) zSig0 < 0 ) zExp = 1;
734
            }
735
            return packFloatx80( zSign, zExp, zSig0 );
736
        }
737
    }
738
    if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
739
    if ( increment ) {
740
        ++zSig0;
741
        if ( zSig0 == 0 ) {
742
            ++zExp;
743
            zSig0 = LIT64( 0x8000000000000000 );
744
        }
745
        else {
746
            zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
747
        }
748
    }
749
    else {
750
        if ( zSig0 == 0 ) zExp = 0;
751
    }
752
    return packFloatx80( zSign, zExp, zSig0 );
753

    
754
}
755

    
756
/*----------------------------------------------------------------------------
757
| Takes an abstract floating-point value having sign `zSign', exponent
758
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
759
| and returns the proper extended double-precision floating-point value
760
| corresponding to the abstract input.  This routine is just like
761
| `roundAndPackFloatx80' except that the input significand does not have to be
762
| normalized.
763
*----------------------------------------------------------------------------*/
764

    
765
static floatx80
766
 normalizeRoundAndPackFloatx80(
767
     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
768
 STATUS_PARAM)
769
{
770
    int8 shiftCount;
771

    
772
    if ( zSig0 == 0 ) {
773
        zSig0 = zSig1;
774
        zSig1 = 0;
775
        zExp -= 64;
776
    }
777
    shiftCount = countLeadingZeros64( zSig0 );
778
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
779
    zExp -= shiftCount;
780
    return
781
        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
782

    
783
}
784

    
785
#endif
786

    
787
#ifdef FLOAT128
788

    
789
/*----------------------------------------------------------------------------
790
| Returns the least-significant 64 fraction bits of the quadruple-precision
791
| floating-point value `a'.
792
*----------------------------------------------------------------------------*/
793

    
794
INLINE bits64 extractFloat128Frac1( float128 a )
795
{
796

    
797
    return a.low;
798

    
799
}
800

    
801
/*----------------------------------------------------------------------------
802
| Returns the most-significant 48 fraction bits of the quadruple-precision
803
| floating-point value `a'.
804
*----------------------------------------------------------------------------*/
805

    
806
INLINE bits64 extractFloat128Frac0( float128 a )
807
{
808

    
809
    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
810

    
811
}
812

    
813
/*----------------------------------------------------------------------------
814
| Returns the exponent bits of the quadruple-precision floating-point value
815
| `a'.
816
*----------------------------------------------------------------------------*/
817

    
818
INLINE int32 extractFloat128Exp( float128 a )
819
{
820

    
821
    return ( a.high>>48 ) & 0x7FFF;
822

    
823
}
824

    
825
/*----------------------------------------------------------------------------
826
| Returns the sign bit of the quadruple-precision floating-point value `a'.
827
*----------------------------------------------------------------------------*/
828

    
829
INLINE flag extractFloat128Sign( float128 a )
830
{
831

    
832
    return a.high>>63;
833

    
834
}
835

    
836
/*----------------------------------------------------------------------------
837
| Normalizes the subnormal quadruple-precision floating-point value
838
| represented by the denormalized significand formed by the concatenation of
839
| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
840
| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
841
| significand are stored at the location pointed to by `zSig0Ptr', and the
842
| least significant 64 bits of the normalized significand are stored at the
843
| location pointed to by `zSig1Ptr'.
844
*----------------------------------------------------------------------------*/
845

    
846
static void
847
 normalizeFloat128Subnormal(
848
     bits64 aSig0,
849
     bits64 aSig1,
850
     int32 *zExpPtr,
851
     bits64 *zSig0Ptr,
852
     bits64 *zSig1Ptr
853
 )
854
{
855
    int8 shiftCount;
856

    
857
    if ( aSig0 == 0 ) {
858
        shiftCount = countLeadingZeros64( aSig1 ) - 15;
859
        if ( shiftCount < 0 ) {
860
            *zSig0Ptr = aSig1>>( - shiftCount );
861
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
862
        }
863
        else {
864
            *zSig0Ptr = aSig1<<shiftCount;
865
            *zSig1Ptr = 0;
866
        }
867
        *zExpPtr = - shiftCount - 63;
868
    }
869
    else {
870
        shiftCount = countLeadingZeros64( aSig0 ) - 15;
871
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
872
        *zExpPtr = 1 - shiftCount;
873
    }
874

    
875
}
876

    
877
/*----------------------------------------------------------------------------
878
| Packs the sign `zSign', the exponent `zExp', and the significand formed
879
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
880
| floating-point value, returning the result.  After being shifted into the
881
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
882
| added together to form the most significant 32 bits of the result.  This
883
| means that any integer portion of `zSig0' will be added into the exponent.
884
| Since a properly normalized significand will have an integer portion equal
885
| to 1, the `zExp' input should be 1 less than the desired result exponent
886
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
887
| significand.
888
*----------------------------------------------------------------------------*/
889

    
890
INLINE float128
891
 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
892
{
893
    float128 z;
894

    
895
    z.low = zSig1;
896
    z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
897
    return z;
898

    
899
}
900

    
901
/*----------------------------------------------------------------------------
902
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
903
| and extended significand formed by the concatenation of `zSig0', `zSig1',
904
| and `zSig2', and returns the proper quadruple-precision floating-point value
905
| corresponding to the abstract input.  Ordinarily, the abstract value is
906
| simply rounded and packed into the quadruple-precision format, with the
907
| inexact exception raised if the abstract input cannot be represented
908
| exactly.  However, if the abstract value is too large, the overflow and
909
| inexact exceptions are raised and an infinity or maximal finite value is
910
| returned.  If the abstract value is too small, the input value is rounded to
911
| a subnormal number, and the underflow and inexact exceptions are raised if
912
| the abstract input cannot be represented exactly as a subnormal quadruple-
913
| precision floating-point number.
914
|     The input significand must be normalized or smaller.  If the input
915
| significand is not normalized, `zExp' must be 0; in that case, the result
916
| returned is a subnormal number, and it must not require rounding.  In the
917
| usual case that the input significand is normalized, `zExp' must be 1 less
918
| than the ``true'' floating-point exponent.  The handling of underflow and
919
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
920
*----------------------------------------------------------------------------*/
921

    
922
static float128
923
 roundAndPackFloat128(
924
     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
925
{
926
    int8 roundingMode;
927
    flag roundNearestEven, increment, isTiny;
928

    
929
    roundingMode = STATUS(float_rounding_mode);
930
    roundNearestEven = ( roundingMode == float_round_nearest_even );
931
    increment = ( (sbits64) zSig2 < 0 );
932
    if ( ! roundNearestEven ) {
933
        if ( roundingMode == float_round_to_zero ) {
934
            increment = 0;
935
        }
936
        else {
937
            if ( zSign ) {
938
                increment = ( roundingMode == float_round_down ) && zSig2;
939
            }
940
            else {
941
                increment = ( roundingMode == float_round_up ) && zSig2;
942
            }
943
        }
944
    }
945
    if ( 0x7FFD <= (bits32) zExp ) {
946
        if (    ( 0x7FFD < zExp )
947
             || (    ( zExp == 0x7FFD )
948
                  && eq128(
949
                         LIT64( 0x0001FFFFFFFFFFFF ),
950
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
951
                         zSig0,
952
                         zSig1
953
                     )
954
                  && increment
955
                )
956
           ) {
957
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
958
            if (    ( roundingMode == float_round_to_zero )
959
                 || ( zSign && ( roundingMode == float_round_up ) )
960
                 || ( ! zSign && ( roundingMode == float_round_down ) )
961
               ) {
962
                return
963
                    packFloat128(
964
                        zSign,
965
                        0x7FFE,
966
                        LIT64( 0x0000FFFFFFFFFFFF ),
967
                        LIT64( 0xFFFFFFFFFFFFFFFF )
968
                    );
969
            }
970
            return packFloat128( zSign, 0x7FFF, 0, 0 );
971
        }
972
        if ( zExp < 0 ) {
973
            if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
974
            isTiny =
975
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
976
                || ( zExp < -1 )
977
                || ! increment
978
                || lt128(
979
                       zSig0,
980
                       zSig1,
981
                       LIT64( 0x0001FFFFFFFFFFFF ),
982
                       LIT64( 0xFFFFFFFFFFFFFFFF )
983
                   );
984
            shift128ExtraRightJamming(
985
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
986
            zExp = 0;
987
            if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
988
            if ( roundNearestEven ) {
989
                increment = ( (sbits64) zSig2 < 0 );
990
            }
991
            else {
992
                if ( zSign ) {
993
                    increment = ( roundingMode == float_round_down ) && zSig2;
994
                }
995
                else {
996
                    increment = ( roundingMode == float_round_up ) && zSig2;
997
                }
998
            }
999
        }
1000
    }
1001
    if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1002
    if ( increment ) {
1003
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1004
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1005
    }
1006
    else {
1007
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1008
    }
1009
    return packFloat128( zSign, zExp, zSig0, zSig1 );
1010

    
1011
}
1012

    
1013
/*----------------------------------------------------------------------------
1014
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1015
| and significand formed by the concatenation of `zSig0' and `zSig1', and
1016
| returns the proper quadruple-precision floating-point value corresponding
1017
| to the abstract input.  This routine is just like `roundAndPackFloat128'
1018
| except that the input significand has fewer bits and does not have to be
1019
| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1020
| point exponent.
1021
*----------------------------------------------------------------------------*/
1022

    
1023
static float128
1024
 normalizeRoundAndPackFloat128(
1025
     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1026
{
1027
    int8 shiftCount;
1028
    bits64 zSig2;
1029

    
1030
    if ( zSig0 == 0 ) {
1031
        zSig0 = zSig1;
1032
        zSig1 = 0;
1033
        zExp -= 64;
1034
    }
1035
    shiftCount = countLeadingZeros64( zSig0 ) - 15;
1036
    if ( 0 <= shiftCount ) {
1037
        zSig2 = 0;
1038
        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1039
    }
1040
    else {
1041
        shift128ExtraRightJamming(
1042
            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1043
    }
1044
    zExp -= shiftCount;
1045
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1046

    
1047
}
1048

    
1049
#endif
1050

    
1051
/*----------------------------------------------------------------------------
1052
| Returns the result of converting the 32-bit two's complement integer `a'
1053
| to the single-precision floating-point format.  The conversion is performed
1054
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1055
*----------------------------------------------------------------------------*/
1056

    
1057
float32 int32_to_float32( int32 a STATUS_PARAM )
1058
{
1059
    flag zSign;
1060

    
1061
    if ( a == 0 ) return float32_zero;
1062
    if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1063
    zSign = ( a < 0 );
1064
    return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1065

    
1066
}
1067

    
1068
/*----------------------------------------------------------------------------
1069
| Returns the result of converting the 32-bit two's complement integer `a'
1070
| to the double-precision floating-point format.  The conversion is performed
1071
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1072
*----------------------------------------------------------------------------*/
1073

    
1074
float64 int32_to_float64( int32 a STATUS_PARAM )
1075
{
1076
    flag zSign;
1077
    uint32 absA;
1078
    int8 shiftCount;
1079
    bits64 zSig;
1080

    
1081
    if ( a == 0 ) return float64_zero;
1082
    zSign = ( a < 0 );
1083
    absA = zSign ? - a : a;
1084
    shiftCount = countLeadingZeros32( absA ) + 21;
1085
    zSig = absA;
1086
    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1087

    
1088
}
1089

    
1090
#ifdef FLOATX80
1091

    
1092
/*----------------------------------------------------------------------------
1093
| Returns the result of converting the 32-bit two's complement integer `a'
1094
| to the extended double-precision floating-point format.  The conversion
1095
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1096
| Arithmetic.
1097
*----------------------------------------------------------------------------*/
1098

    
1099
floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1100
{
1101
    flag zSign;
1102
    uint32 absA;
1103
    int8 shiftCount;
1104
    bits64 zSig;
1105

    
1106
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1107
    zSign = ( a < 0 );
1108
    absA = zSign ? - a : a;
1109
    shiftCount = countLeadingZeros32( absA ) + 32;
1110
    zSig = absA;
1111
    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1112

    
1113
}
1114

    
1115
#endif
1116

    
1117
#ifdef FLOAT128
1118

    
1119
/*----------------------------------------------------------------------------
1120
| Returns the result of converting the 32-bit two's complement integer `a' to
1121
| the quadruple-precision floating-point format.  The conversion is performed
1122
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1123
*----------------------------------------------------------------------------*/
1124

    
1125
float128 int32_to_float128( int32 a STATUS_PARAM )
1126
{
1127
    flag zSign;
1128
    uint32 absA;
1129
    int8 shiftCount;
1130
    bits64 zSig0;
1131

    
1132
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1133
    zSign = ( a < 0 );
1134
    absA = zSign ? - a : a;
1135
    shiftCount = countLeadingZeros32( absA ) + 17;
1136
    zSig0 = absA;
1137
    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1138

    
1139
}
1140

    
1141
#endif
1142

    
1143
/*----------------------------------------------------------------------------
1144
| Returns the result of converting the 64-bit two's complement integer `a'
1145
| to the single-precision floating-point format.  The conversion is performed
1146
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1147
*----------------------------------------------------------------------------*/
1148

    
1149
float32 int64_to_float32( int64 a STATUS_PARAM )
1150
{
1151
    flag zSign;
1152
    uint64 absA;
1153
    int8 shiftCount;
1154

    
1155
    if ( a == 0 ) return float32_zero;
1156
    zSign = ( a < 0 );
1157
    absA = zSign ? - a : a;
1158
    shiftCount = countLeadingZeros64( absA ) - 40;
1159
    if ( 0 <= shiftCount ) {
1160
        return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1161
    }
1162
    else {
1163
        shiftCount += 7;
1164
        if ( shiftCount < 0 ) {
1165
            shift64RightJamming( absA, - shiftCount, &absA );
1166
        }
1167
        else {
1168
            absA <<= shiftCount;
1169
        }
1170
        return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1171
    }
1172

    
1173
}
1174

    
1175
float32 uint64_to_float32( uint64 a STATUS_PARAM )
1176
{
1177
    int8 shiftCount;
1178

    
1179
    if ( a == 0 ) return float32_zero;
1180
    shiftCount = countLeadingZeros64( a ) - 40;
1181
    if ( 0 <= shiftCount ) {
1182
        return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1183
    }
1184
    else {
1185
        shiftCount += 7;
1186
        if ( shiftCount < 0 ) {
1187
            shift64RightJamming( a, - shiftCount, &a );
1188
        }
1189
        else {
1190
            a <<= shiftCount;
1191
        }
1192
        return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1193
    }
1194
}
1195

    
1196
/*----------------------------------------------------------------------------
1197
| Returns the result of converting the 64-bit two's complement integer `a'
1198
| to the double-precision floating-point format.  The conversion is performed
1199
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1200
*----------------------------------------------------------------------------*/
1201

    
1202
float64 int64_to_float64( int64 a STATUS_PARAM )
1203
{
1204
    flag zSign;
1205

    
1206
    if ( a == 0 ) return float64_zero;
1207
    if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1208
        return packFloat64( 1, 0x43E, 0 );
1209
    }
1210
    zSign = ( a < 0 );
1211
    return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1212

    
1213
}
1214

    
1215
float64 uint64_to_float64( uint64 a STATUS_PARAM )
1216
{
1217
    if ( a == 0 ) return float64_zero;
1218
    return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1219

    
1220
}
1221

    
1222
#ifdef FLOATX80
1223

    
1224
/*----------------------------------------------------------------------------
1225
| Returns the result of converting the 64-bit two's complement integer `a'
1226
| to the extended double-precision floating-point format.  The conversion
1227
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1228
| Arithmetic.
1229
*----------------------------------------------------------------------------*/
1230

    
1231
floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1232
{
1233
    flag zSign;
1234
    uint64 absA;
1235
    int8 shiftCount;
1236

    
1237
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1238
    zSign = ( a < 0 );
1239
    absA = zSign ? - a : a;
1240
    shiftCount = countLeadingZeros64( absA );
1241
    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1242

    
1243
}
1244

    
1245
#endif
1246

    
1247
#ifdef FLOAT128
1248

    
1249
/*----------------------------------------------------------------------------
1250
| Returns the result of converting the 64-bit two's complement integer `a' to
1251
| the quadruple-precision floating-point format.  The conversion is performed
1252
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1253
*----------------------------------------------------------------------------*/
1254

    
1255
float128 int64_to_float128( int64 a STATUS_PARAM )
1256
{
1257
    flag zSign;
1258
    uint64 absA;
1259
    int8 shiftCount;
1260
    int32 zExp;
1261
    bits64 zSig0, zSig1;
1262

    
1263
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1264
    zSign = ( a < 0 );
1265
    absA = zSign ? - a : a;
1266
    shiftCount = countLeadingZeros64( absA ) + 49;
1267
    zExp = 0x406E - shiftCount;
1268
    if ( 64 <= shiftCount ) {
1269
        zSig1 = 0;
1270
        zSig0 = absA;
1271
        shiftCount -= 64;
1272
    }
1273
    else {
1274
        zSig1 = absA;
1275
        zSig0 = 0;
1276
    }
1277
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1278
    return packFloat128( zSign, zExp, zSig0, zSig1 );
1279

    
1280
}
1281

    
1282
#endif
1283

    
1284
/*----------------------------------------------------------------------------
1285
| Returns the result of converting the single-precision floating-point value
1286
| `a' to the 32-bit two's complement integer format.  The conversion is
1287
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1288
| Arithmetic---which means in particular that the conversion is rounded
1289
| according to the current rounding mode.  If `a' is a NaN, the largest
1290
| positive integer is returned.  Otherwise, if the conversion overflows, the
1291
| largest integer with the same sign as `a' is returned.
1292
*----------------------------------------------------------------------------*/
1293

    
1294
int32 float32_to_int32( float32 a STATUS_PARAM )
1295
{
1296
    flag aSign;
1297
    int16 aExp, shiftCount;
1298
    bits32 aSig;
1299
    bits64 aSig64;
1300

    
1301
    aSig = extractFloat32Frac( a );
1302
    aExp = extractFloat32Exp( a );
1303
    aSign = extractFloat32Sign( a );
1304
    if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1305
    if ( aExp ) aSig |= 0x00800000;
1306
    shiftCount = 0xAF - aExp;
1307
    aSig64 = aSig;
1308
    aSig64 <<= 32;
1309
    if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1310
    return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1311

    
1312
}
1313

    
1314
/*----------------------------------------------------------------------------
1315
| Returns the result of converting the single-precision floating-point value
1316
| `a' to the 32-bit two's complement integer format.  The conversion is
1317
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1318
| Arithmetic, except that the conversion is always rounded toward zero.
1319
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1320
| the conversion overflows, the largest integer with the same sign as `a' is
1321
| returned.
1322
*----------------------------------------------------------------------------*/
1323

    
1324
int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1325
{
1326
    flag aSign;
1327
    int16 aExp, shiftCount;
1328
    bits32 aSig;
1329
    int32 z;
1330

    
1331
    aSig = extractFloat32Frac( a );
1332
    aExp = extractFloat32Exp( a );
1333
    aSign = extractFloat32Sign( a );
1334
    shiftCount = aExp - 0x9E;
1335
    if ( 0 <= shiftCount ) {
1336
        if ( float32_val(a) != 0xCF000000 ) {
1337
            float_raise( float_flag_invalid STATUS_VAR);
1338
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1339
        }
1340
        return (sbits32) 0x80000000;
1341
    }
1342
    else if ( aExp <= 0x7E ) {
1343
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1344
        return 0;
1345
    }
1346
    aSig = ( aSig | 0x00800000 )<<8;
1347
    z = aSig>>( - shiftCount );
1348
    if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1349
        STATUS(float_exception_flags) |= float_flag_inexact;
1350
    }
1351
    if ( aSign ) z = - z;
1352
    return z;
1353

    
1354
}
1355

    
1356
/*----------------------------------------------------------------------------
1357
| Returns the result of converting the single-precision floating-point value
1358
| `a' to the 64-bit two's complement integer format.  The conversion is
1359
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1360
| Arithmetic---which means in particular that the conversion is rounded
1361
| according to the current rounding mode.  If `a' is a NaN, the largest
1362
| positive integer is returned.  Otherwise, if the conversion overflows, the
1363
| largest integer with the same sign as `a' is returned.
1364
*----------------------------------------------------------------------------*/
1365

    
1366
int64 float32_to_int64( float32 a STATUS_PARAM )
1367
{
1368
    flag aSign;
1369
    int16 aExp, shiftCount;
1370
    bits32 aSig;
1371
    bits64 aSig64, aSigExtra;
1372

    
1373
    aSig = extractFloat32Frac( a );
1374
    aExp = extractFloat32Exp( a );
1375
    aSign = extractFloat32Sign( a );
1376
    shiftCount = 0xBE - aExp;
1377
    if ( shiftCount < 0 ) {
1378
        float_raise( float_flag_invalid STATUS_VAR);
1379
        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1380
            return LIT64( 0x7FFFFFFFFFFFFFFF );
1381
        }
1382
        return (sbits64) LIT64( 0x8000000000000000 );
1383
    }
1384
    if ( aExp ) aSig |= 0x00800000;
1385
    aSig64 = aSig;
1386
    aSig64 <<= 40;
1387
    shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1388
    return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1389

    
1390
}
1391

    
1392
/*----------------------------------------------------------------------------
1393
| Returns the result of converting the single-precision floating-point value
1394
| `a' to the 64-bit two's complement integer format.  The conversion is
1395
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1396
| Arithmetic, except that the conversion is always rounded toward zero.  If
1397
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1398
| conversion overflows, the largest integer with the same sign as `a' is
1399
| returned.
1400
*----------------------------------------------------------------------------*/
1401

    
1402
int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1403
{
1404
    flag aSign;
1405
    int16 aExp, shiftCount;
1406
    bits32 aSig;
1407
    bits64 aSig64;
1408
    int64 z;
1409

    
1410
    aSig = extractFloat32Frac( a );
1411
    aExp = extractFloat32Exp( a );
1412
    aSign = extractFloat32Sign( a );
1413
    shiftCount = aExp - 0xBE;
1414
    if ( 0 <= shiftCount ) {
1415
        if ( float32_val(a) != 0xDF000000 ) {
1416
            float_raise( float_flag_invalid STATUS_VAR);
1417
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1418
                return LIT64( 0x7FFFFFFFFFFFFFFF );
1419
            }
1420
        }
1421
        return (sbits64) LIT64( 0x8000000000000000 );
1422
    }
1423
    else if ( aExp <= 0x7E ) {
1424
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1425
        return 0;
1426
    }
1427
    aSig64 = aSig | 0x00800000;
1428
    aSig64 <<= 40;
1429
    z = aSig64>>( - shiftCount );
1430
    if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1431
        STATUS(float_exception_flags) |= float_flag_inexact;
1432
    }
1433
    if ( aSign ) z = - z;
1434
    return z;
1435

    
1436
}
1437

    
1438
/*----------------------------------------------------------------------------
1439
| Returns the result of converting the single-precision floating-point value
1440
| `a' to the double-precision floating-point format.  The conversion is
1441
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1442
| Arithmetic.
1443
*----------------------------------------------------------------------------*/
1444

    
1445
float64 float32_to_float64( float32 a STATUS_PARAM )
1446
{
1447
    flag aSign;
1448
    int16 aExp;
1449
    bits32 aSig;
1450

    
1451
    aSig = extractFloat32Frac( a );
1452
    aExp = extractFloat32Exp( a );
1453
    aSign = extractFloat32Sign( a );
1454
    if ( aExp == 0xFF ) {
1455
        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1456
        return packFloat64( aSign, 0x7FF, 0 );
1457
    }
1458
    if ( aExp == 0 ) {
1459
        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1460
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1461
        --aExp;
1462
    }
1463
    return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1464

    
1465
}
1466

    
1467
#ifdef FLOATX80
1468

    
1469
/*----------------------------------------------------------------------------
1470
| Returns the result of converting the single-precision floating-point value
1471
| `a' to the extended double-precision floating-point format.  The conversion
1472
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1473
| Arithmetic.
1474
*----------------------------------------------------------------------------*/
1475

    
1476
floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1477
{
1478
    flag aSign;
1479
    int16 aExp;
1480
    bits32 aSig;
1481

    
1482
    aSig = extractFloat32Frac( a );
1483
    aExp = extractFloat32Exp( a );
1484
    aSign = extractFloat32Sign( a );
1485
    if ( aExp == 0xFF ) {
1486
        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1487
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1488
    }
1489
    if ( aExp == 0 ) {
1490
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1491
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1492
    }
1493
    aSig |= 0x00800000;
1494
    return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1495

    
1496
}
1497

    
1498
#endif
1499

    
1500
#ifdef FLOAT128
1501

    
1502
/*----------------------------------------------------------------------------
1503
| Returns the result of converting the single-precision floating-point value
1504
| `a' to the double-precision floating-point format.  The conversion is
1505
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1506
| Arithmetic.
1507
*----------------------------------------------------------------------------*/
1508

    
1509
float128 float32_to_float128( float32 a STATUS_PARAM )
1510
{
1511
    flag aSign;
1512
    int16 aExp;
1513
    bits32 aSig;
1514

    
1515
    aSig = extractFloat32Frac( a );
1516
    aExp = extractFloat32Exp( a );
1517
    aSign = extractFloat32Sign( a );
1518
    if ( aExp == 0xFF ) {
1519
        if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1520
        return packFloat128( aSign, 0x7FFF, 0, 0 );
1521
    }
1522
    if ( aExp == 0 ) {
1523
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1524
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1525
        --aExp;
1526
    }
1527
    return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1528

    
1529
}
1530

    
1531
#endif
1532

    
1533
/*----------------------------------------------------------------------------
1534
| Rounds the single-precision floating-point value `a' to an integer, and
1535
| returns the result as a single-precision floating-point value.  The
1536
| operation is performed according to the IEC/IEEE Standard for Binary
1537
| Floating-Point Arithmetic.
1538
*----------------------------------------------------------------------------*/
1539

    
1540
float32 float32_round_to_int( float32 a STATUS_PARAM)
1541
{
1542
    flag aSign;
1543
    int16 aExp;
1544
    bits32 lastBitMask, roundBitsMask;
1545
    int8 roundingMode;
1546
    bits32 z;
1547

    
1548
    aExp = extractFloat32Exp( a );
1549
    if ( 0x96 <= aExp ) {
1550
        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1551
            return propagateFloat32NaN( a, a STATUS_VAR );
1552
        }
1553
        return a;
1554
    }
1555
    if ( aExp <= 0x7E ) {
1556
        if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
1557
        STATUS(float_exception_flags) |= float_flag_inexact;
1558
        aSign = extractFloat32Sign( a );
1559
        switch ( STATUS(float_rounding_mode) ) {
1560
         case float_round_nearest_even:
1561
            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1562
                return packFloat32( aSign, 0x7F, 0 );
1563
            }
1564
            break;
1565
         case float_round_down:
1566
            return make_float32(aSign ? 0xBF800000 : 0);
1567
         case float_round_up:
1568
            return make_float32(aSign ? 0x80000000 : 0x3F800000);
1569
        }
1570
        return packFloat32( aSign, 0, 0 );
1571
    }
1572
    lastBitMask = 1;
1573
    lastBitMask <<= 0x96 - aExp;
1574
    roundBitsMask = lastBitMask - 1;
1575
    z = float32_val(a);
1576
    roundingMode = STATUS(float_rounding_mode);
1577
    if ( roundingMode == float_round_nearest_even ) {
1578
        z += lastBitMask>>1;
1579
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1580
    }
1581
    else if ( roundingMode != float_round_to_zero ) {
1582
        if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1583
            z += roundBitsMask;
1584
        }
1585
    }
1586
    z &= ~ roundBitsMask;
1587
    if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1588
    return make_float32(z);
1589

    
1590
}
1591

    
1592
/*----------------------------------------------------------------------------
1593
| Returns the result of adding the absolute values of the single-precision
1594
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1595
| before being returned.  `zSign' is ignored if the result is a NaN.
1596
| The addition is performed according to the IEC/IEEE Standard for Binary
1597
| Floating-Point Arithmetic.
1598
*----------------------------------------------------------------------------*/
1599

    
1600
static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1601
{
1602
    int16 aExp, bExp, zExp;
1603
    bits32 aSig, bSig, zSig;
1604
    int16 expDiff;
1605

    
1606
    aSig = extractFloat32Frac( a );
1607
    aExp = extractFloat32Exp( a );
1608
    bSig = extractFloat32Frac( b );
1609
    bExp = extractFloat32Exp( b );
1610
    expDiff = aExp - bExp;
1611
    aSig <<= 6;
1612
    bSig <<= 6;
1613
    if ( 0 < expDiff ) {
1614
        if ( aExp == 0xFF ) {
1615
            if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1616
            return a;
1617
        }
1618
        if ( bExp == 0 ) {
1619
            --expDiff;
1620
        }
1621
        else {
1622
            bSig |= 0x20000000;
1623
        }
1624
        shift32RightJamming( bSig, expDiff, &bSig );
1625
        zExp = aExp;
1626
    }
1627
    else if ( expDiff < 0 ) {
1628
        if ( bExp == 0xFF ) {
1629
            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1630
            return packFloat32( zSign, 0xFF, 0 );
1631
        }
1632
        if ( aExp == 0 ) {
1633
            ++expDiff;
1634
        }
1635
        else {
1636
            aSig |= 0x20000000;
1637
        }
1638
        shift32RightJamming( aSig, - expDiff, &aSig );
1639
        zExp = bExp;
1640
    }
1641
    else {
1642
        if ( aExp == 0xFF ) {
1643
            if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1644
            return a;
1645
        }
1646
        if ( aExp == 0 ) {
1647
            if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1648
            return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1649
        }
1650
        zSig = 0x40000000 + aSig + bSig;
1651
        zExp = aExp;
1652
        goto roundAndPack;
1653
    }
1654
    aSig |= 0x20000000;
1655
    zSig = ( aSig + bSig )<<1;
1656
    --zExp;
1657
    if ( (sbits32) zSig < 0 ) {
1658
        zSig = aSig + bSig;
1659
        ++zExp;
1660
    }
1661
 roundAndPack:
1662
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1663

    
1664
}
1665

    
1666
/*----------------------------------------------------------------------------
1667
| Returns the result of subtracting the absolute values of the single-
1668
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
1669
| difference is negated before being returned.  `zSign' is ignored if the
1670
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
1671
| Standard for Binary Floating-Point Arithmetic.
1672
*----------------------------------------------------------------------------*/
1673

    
1674
static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1675
{
1676
    int16 aExp, bExp, zExp;
1677
    bits32 aSig, bSig, zSig;
1678
    int16 expDiff;
1679

    
1680
    aSig = extractFloat32Frac( a );
1681
    aExp = extractFloat32Exp( a );
1682
    bSig = extractFloat32Frac( b );
1683
    bExp = extractFloat32Exp( b );
1684
    expDiff = aExp - bExp;
1685
    aSig <<= 7;
1686
    bSig <<= 7;
1687
    if ( 0 < expDiff ) goto aExpBigger;
1688
    if ( expDiff < 0 ) goto bExpBigger;
1689
    if ( aExp == 0xFF ) {
1690
        if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1691
        float_raise( float_flag_invalid STATUS_VAR);
1692
        return float32_default_nan;
1693
    }
1694
    if ( aExp == 0 ) {
1695
        aExp = 1;
1696
        bExp = 1;
1697
    }
1698
    if ( bSig < aSig ) goto aBigger;
1699
    if ( aSig < bSig ) goto bBigger;
1700
    return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1701
 bExpBigger:
1702
    if ( bExp == 0xFF ) {
1703
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1704
        return packFloat32( zSign ^ 1, 0xFF, 0 );
1705
    }
1706
    if ( aExp == 0 ) {
1707
        ++expDiff;
1708
    }
1709
    else {
1710
        aSig |= 0x40000000;
1711
    }
1712
    shift32RightJamming( aSig, - expDiff, &aSig );
1713
    bSig |= 0x40000000;
1714
 bBigger:
1715
    zSig = bSig - aSig;
1716
    zExp = bExp;
1717
    zSign ^= 1;
1718
    goto normalizeRoundAndPack;
1719
 aExpBigger:
1720
    if ( aExp == 0xFF ) {
1721
        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1722
        return a;
1723
    }
1724
    if ( bExp == 0 ) {
1725
        --expDiff;
1726
    }
1727
    else {
1728
        bSig |= 0x40000000;
1729
    }
1730
    shift32RightJamming( bSig, expDiff, &bSig );
1731
    aSig |= 0x40000000;
1732
 aBigger:
1733
    zSig = aSig - bSig;
1734
    zExp = aExp;
1735
 normalizeRoundAndPack:
1736
    --zExp;
1737
    return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1738

    
1739
}
1740

    
1741
/*----------------------------------------------------------------------------
1742
| Returns the result of adding the single-precision floating-point values `a'
1743
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
1744
| Binary Floating-Point Arithmetic.
1745
*----------------------------------------------------------------------------*/
1746

    
1747
float32 float32_add( float32 a, float32 b STATUS_PARAM )
1748
{
1749
    flag aSign, bSign;
1750

    
1751
    aSign = extractFloat32Sign( a );
1752
    bSign = extractFloat32Sign( b );
1753
    if ( aSign == bSign ) {
1754
        return addFloat32Sigs( a, b, aSign STATUS_VAR);
1755
    }
1756
    else {
1757
        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1758
    }
1759

    
1760
}
1761

    
1762
/*----------------------------------------------------------------------------
1763
| Returns the result of subtracting the single-precision floating-point values
1764
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1765
| for Binary Floating-Point Arithmetic.
1766
*----------------------------------------------------------------------------*/
1767

    
1768
float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1769
{
1770
    flag aSign, bSign;
1771

    
1772
    aSign = extractFloat32Sign( a );
1773
    bSign = extractFloat32Sign( b );
1774
    if ( aSign == bSign ) {
1775
        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1776
    }
1777
    else {
1778
        return addFloat32Sigs( a, b, aSign STATUS_VAR );
1779
    }
1780

    
1781
}
1782

    
1783
/*----------------------------------------------------------------------------
1784
| Returns the result of multiplying the single-precision floating-point values
1785
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1786
| for Binary Floating-Point Arithmetic.
1787
*----------------------------------------------------------------------------*/
1788

    
1789
float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1790
{
1791
    flag aSign, bSign, zSign;
1792
    int16 aExp, bExp, zExp;
1793
    bits32 aSig, bSig;
1794
    bits64 zSig64;
1795
    bits32 zSig;
1796

    
1797
    aSig = extractFloat32Frac( a );
1798
    aExp = extractFloat32Exp( a );
1799
    aSign = extractFloat32Sign( a );
1800
    bSig = extractFloat32Frac( b );
1801
    bExp = extractFloat32Exp( b );
1802
    bSign = extractFloat32Sign( b );
1803
    zSign = aSign ^ bSign;
1804
    if ( aExp == 0xFF ) {
1805
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1806
            return propagateFloat32NaN( a, b STATUS_VAR );
1807
        }
1808
        if ( ( bExp | bSig ) == 0 ) {
1809
            float_raise( float_flag_invalid STATUS_VAR);
1810
            return float32_default_nan;
1811
        }
1812
        return packFloat32( zSign, 0xFF, 0 );
1813
    }
1814
    if ( bExp == 0xFF ) {
1815
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1816
        if ( ( aExp | aSig ) == 0 ) {
1817
            float_raise( float_flag_invalid STATUS_VAR);
1818
            return float32_default_nan;
1819
        }
1820
        return packFloat32( zSign, 0xFF, 0 );
1821
    }
1822
    if ( aExp == 0 ) {
1823
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1824
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1825
    }
1826
    if ( bExp == 0 ) {
1827
        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1828
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1829
    }
1830
    zExp = aExp + bExp - 0x7F;
1831
    aSig = ( aSig | 0x00800000 )<<7;
1832
    bSig = ( bSig | 0x00800000 )<<8;
1833
    shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1834
    zSig = zSig64;
1835
    if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1836
        zSig <<= 1;
1837
        --zExp;
1838
    }
1839
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1840

    
1841
}
1842

    
1843
/*----------------------------------------------------------------------------
1844
| Returns the result of dividing the single-precision floating-point value `a'
1845
| by the corresponding value `b'.  The operation is performed according to the
1846
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1847
*----------------------------------------------------------------------------*/
1848

    
1849
float32 float32_div( float32 a, float32 b STATUS_PARAM )
1850
{
1851
    flag aSign, bSign, zSign;
1852
    int16 aExp, bExp, zExp;
1853
    bits32 aSig, bSig, zSig;
1854

    
1855
    aSig = extractFloat32Frac( a );
1856
    aExp = extractFloat32Exp( a );
1857
    aSign = extractFloat32Sign( a );
1858
    bSig = extractFloat32Frac( b );
1859
    bExp = extractFloat32Exp( b );
1860
    bSign = extractFloat32Sign( b );
1861
    zSign = aSign ^ bSign;
1862
    if ( aExp == 0xFF ) {
1863
        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1864
        if ( bExp == 0xFF ) {
1865
            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1866
            float_raise( float_flag_invalid STATUS_VAR);
1867
            return float32_default_nan;
1868
        }
1869
        return packFloat32( zSign, 0xFF, 0 );
1870
    }
1871
    if ( bExp == 0xFF ) {
1872
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1873
        return packFloat32( zSign, 0, 0 );
1874
    }
1875
    if ( bExp == 0 ) {
1876
        if ( bSig == 0 ) {
1877
            if ( ( aExp | aSig ) == 0 ) {
1878
                float_raise( float_flag_invalid STATUS_VAR);
1879
                return float32_default_nan;
1880
            }
1881
            float_raise( float_flag_divbyzero STATUS_VAR);
1882
            return packFloat32( zSign, 0xFF, 0 );
1883
        }
1884
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1885
    }
1886
    if ( aExp == 0 ) {
1887
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1888
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1889
    }
1890
    zExp = aExp - bExp + 0x7D;
1891
    aSig = ( aSig | 0x00800000 )<<7;
1892
    bSig = ( bSig | 0x00800000 )<<8;
1893
    if ( bSig <= ( aSig + aSig ) ) {
1894
        aSig >>= 1;
1895
        ++zExp;
1896
    }
1897
    zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1898
    if ( ( zSig & 0x3F ) == 0 ) {
1899
        zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1900
    }
1901
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1902

    
1903
}
1904

    
1905
/*----------------------------------------------------------------------------
1906
| Returns the remainder of the single-precision floating-point value `a'
1907
| with respect to the corresponding value `b'.  The operation is performed
1908
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1909
*----------------------------------------------------------------------------*/
1910

    
1911
float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1912
{
1913
    flag aSign, bSign, zSign;
1914
    int16 aExp, bExp, expDiff;
1915
    bits32 aSig, bSig;
1916
    bits32 q;
1917
    bits64 aSig64, bSig64, q64;
1918
    bits32 alternateASig;
1919
    sbits32 sigMean;
1920

    
1921
    aSig = extractFloat32Frac( a );
1922
    aExp = extractFloat32Exp( a );
1923
    aSign = extractFloat32Sign( a );
1924
    bSig = extractFloat32Frac( b );
1925
    bExp = extractFloat32Exp( b );
1926
    bSign = extractFloat32Sign( b );
1927
    if ( aExp == 0xFF ) {
1928
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1929
            return propagateFloat32NaN( a, b STATUS_VAR );
1930
        }
1931
        float_raise( float_flag_invalid STATUS_VAR);
1932
        return float32_default_nan;
1933
    }
1934
    if ( bExp == 0xFF ) {
1935
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1936
        return a;
1937
    }
1938
    if ( bExp == 0 ) {
1939
        if ( bSig == 0 ) {
1940
            float_raise( float_flag_invalid STATUS_VAR);
1941
            return float32_default_nan;
1942
        }
1943
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1944
    }
1945
    if ( aExp == 0 ) {
1946
        if ( aSig == 0 ) return a;
1947
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1948
    }
1949
    expDiff = aExp - bExp;
1950
    aSig |= 0x00800000;
1951
    bSig |= 0x00800000;
1952
    if ( expDiff < 32 ) {
1953
        aSig <<= 8;
1954
        bSig <<= 8;
1955
        if ( expDiff < 0 ) {
1956
            if ( expDiff < -1 ) return a;
1957
            aSig >>= 1;
1958
        }
1959
        q = ( bSig <= aSig );
1960
        if ( q ) aSig -= bSig;
1961
        if ( 0 < expDiff ) {
1962
            q = ( ( (bits64) aSig )<<32 ) / bSig;
1963
            q >>= 32 - expDiff;
1964
            bSig >>= 2;
1965
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1966
        }
1967
        else {
1968
            aSig >>= 2;
1969
            bSig >>= 2;
1970
        }
1971
    }
1972
    else {
1973
        if ( bSig <= aSig ) aSig -= bSig;
1974
        aSig64 = ( (bits64) aSig )<<40;
1975
        bSig64 = ( (bits64) bSig )<<40;
1976
        expDiff -= 64;
1977
        while ( 0 < expDiff ) {
1978
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1979
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1980
            aSig64 = - ( ( bSig * q64 )<<38 );
1981
            expDiff -= 62;
1982
        }
1983
        expDiff += 64;
1984
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1985
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1986
        q = q64>>( 64 - expDiff );
1987
        bSig <<= 6;
1988
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1989
    }
1990
    do {
1991
        alternateASig = aSig;
1992
        ++q;
1993
        aSig -= bSig;
1994
    } while ( 0 <= (sbits32) aSig );
1995
    sigMean = aSig + alternateASig;
1996
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1997
        aSig = alternateASig;
1998
    }
1999
    zSign = ( (sbits32) aSig < 0 );
2000
    if ( zSign ) aSig = - aSig;
2001
    return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2002

    
2003
}
2004

    
2005
/*----------------------------------------------------------------------------
2006
| Returns the square root of the single-precision floating-point value `a'.
2007
| The operation is performed according to the IEC/IEEE Standard for Binary
2008
| Floating-Point Arithmetic.
2009
*----------------------------------------------------------------------------*/
2010

    
2011
float32 float32_sqrt( float32 a STATUS_PARAM )
2012
{
2013
    flag aSign;
2014
    int16 aExp, zExp;
2015
    bits32 aSig, zSig;
2016
    bits64 rem, term;
2017

    
2018
    aSig = extractFloat32Frac( a );
2019
    aExp = extractFloat32Exp( a );
2020
    aSign = extractFloat32Sign( a );
2021
    if ( aExp == 0xFF ) {
2022
        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2023
        if ( ! aSign ) return a;
2024
        float_raise( float_flag_invalid STATUS_VAR);
2025
        return float32_default_nan;
2026
    }
2027
    if ( aSign ) {
2028
        if ( ( aExp | aSig ) == 0 ) return a;
2029
        float_raise( float_flag_invalid STATUS_VAR);
2030
        return float32_default_nan;
2031
    }
2032
    if ( aExp == 0 ) {
2033
        if ( aSig == 0 ) return float32_zero;
2034
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2035
    }
2036
    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2037
    aSig = ( aSig | 0x00800000 )<<8;
2038
    zSig = estimateSqrt32( aExp, aSig ) + 2;
2039
    if ( ( zSig & 0x7F ) <= 5 ) {
2040
        if ( zSig < 2 ) {
2041
            zSig = 0x7FFFFFFF;
2042
            goto roundAndPack;
2043
        }
2044
        aSig >>= aExp & 1;
2045
        term = ( (bits64) zSig ) * zSig;
2046
        rem = ( ( (bits64) aSig )<<32 ) - term;
2047
        while ( (sbits64) rem < 0 ) {
2048
            --zSig;
2049
            rem += ( ( (bits64) zSig )<<1 ) | 1;
2050
        }
2051
        zSig |= ( rem != 0 );
2052
    }
2053
    shift32RightJamming( zSig, 1, &zSig );
2054
 roundAndPack:
2055
    return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2056

    
2057
}
2058

    
2059
/*----------------------------------------------------------------------------
2060
| Returns the binary log of the single-precision floating-point value `a'.
2061
| The operation is performed according to the IEC/IEEE Standard for Binary
2062
| Floating-Point Arithmetic.
2063
*----------------------------------------------------------------------------*/
2064
float32 float32_log2( float32 a STATUS_PARAM )
2065
{
2066
    flag aSign, zSign;
2067
    int16 aExp;
2068
    bits32 aSig, zSig, i;
2069

    
2070
    aSig = extractFloat32Frac( a );
2071
    aExp = extractFloat32Exp( a );
2072
    aSign = extractFloat32Sign( a );
2073

    
2074
    if ( aExp == 0 ) {
2075
        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2076
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2077
    }
2078
    if ( aSign ) {
2079
        float_raise( float_flag_invalid STATUS_VAR);
2080
        return float32_default_nan;
2081
    }
2082
    if ( aExp == 0xFF ) {
2083
        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2084
        return a;
2085
    }
2086

    
2087
    aExp -= 0x7F;
2088
    aSig |= 0x00800000;
2089
    zSign = aExp < 0;
2090
    zSig = aExp << 23;
2091

    
2092
    for (i = 1 << 22; i > 0; i >>= 1) {
2093
        aSig = ( (bits64)aSig * aSig ) >> 23;
2094
        if ( aSig & 0x01000000 ) {
2095
            aSig >>= 1;
2096
            zSig |= i;
2097
        }
2098
    }
2099

    
2100
    if ( zSign )
2101
        zSig = -zSig;
2102

    
2103
    return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2104
}
2105

    
2106
/*----------------------------------------------------------------------------
2107
| Returns 1 if the single-precision floating-point value `a' is equal to
2108
| the corresponding value `b', and 0 otherwise.  The comparison is performed
2109
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2110
*----------------------------------------------------------------------------*/
2111

    
2112
int float32_eq( float32 a, float32 b STATUS_PARAM )
2113
{
2114

    
2115
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2116
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2117
       ) {
2118
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2119
            float_raise( float_flag_invalid STATUS_VAR);
2120
        }
2121
        return 0;
2122
    }
2123
    return ( float32_val(a) == float32_val(b) ) ||
2124
            ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2125

    
2126
}
2127

    
2128
/*----------------------------------------------------------------------------
2129
| Returns 1 if the single-precision floating-point value `a' is less than
2130
| or equal to the corresponding value `b', and 0 otherwise.  The comparison
2131
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2132
| Arithmetic.
2133
*----------------------------------------------------------------------------*/
2134

    
2135
int float32_le( float32 a, float32 b STATUS_PARAM )
2136
{
2137
    flag aSign, bSign;
2138
    bits32 av, bv;
2139

    
2140
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2141
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2142
       ) {
2143
        float_raise( float_flag_invalid STATUS_VAR);
2144
        return 0;
2145
    }
2146
    aSign = extractFloat32Sign( a );
2147
    bSign = extractFloat32Sign( b );
2148
    av = float32_val(a);
2149
    bv = float32_val(b);
2150
    if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2151
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2152

    
2153
}
2154

    
2155
/*----------------------------------------------------------------------------
2156
| Returns 1 if the single-precision floating-point value `a' is less than
2157
| the corresponding value `b', and 0 otherwise.  The comparison is performed
2158
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2159
*----------------------------------------------------------------------------*/
2160

    
2161
int float32_lt( float32 a, float32 b STATUS_PARAM )
2162
{
2163
    flag aSign, bSign;
2164
    bits32 av, bv;
2165

    
2166
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2167
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2168
       ) {
2169
        float_raise( float_flag_invalid STATUS_VAR);
2170
        return 0;
2171
    }
2172
    aSign = extractFloat32Sign( a );
2173
    bSign = extractFloat32Sign( b );
2174
    av = float32_val(a);
2175
    bv = float32_val(b);
2176
    if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2177
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2178

    
2179
}
2180

    
2181
/*----------------------------------------------------------------------------
2182
| Returns 1 if the single-precision floating-point value `a' is equal to
2183
| the corresponding value `b', and 0 otherwise.  The invalid exception is
2184
| raised if either operand is a NaN.  Otherwise, the comparison is performed
2185
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2186
*----------------------------------------------------------------------------*/
2187

    
2188
int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2189
{
2190
    bits32 av, bv;
2191

    
2192
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2193
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2194
       ) {
2195
        float_raise( float_flag_invalid STATUS_VAR);
2196
        return 0;
2197
    }
2198
    av = float32_val(a);
2199
    bv = float32_val(b);
2200
    return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2201

    
2202
}
2203

    
2204
/*----------------------------------------------------------------------------
2205
| Returns 1 if the single-precision floating-point value `a' is less than or
2206
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2207
| cause an exception.  Otherwise, the comparison is performed according to the
2208
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2209
*----------------------------------------------------------------------------*/
2210

    
2211
int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2212
{
2213
    flag aSign, bSign;
2214
    bits32 av, bv;
2215

    
2216
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2217
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2218
       ) {
2219
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2220
            float_raise( float_flag_invalid STATUS_VAR);
2221
        }
2222
        return 0;
2223
    }
2224
    aSign = extractFloat32Sign( a );
2225
    bSign = extractFloat32Sign( b );
2226
    av = float32_val(a);
2227
    bv = float32_val(b);
2228
    if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2229
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2230

    
2231
}
2232

    
2233
/*----------------------------------------------------------------------------
2234
| Returns 1 if the single-precision floating-point value `a' is less than
2235
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2236
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2237
| Standard for Binary Floating-Point Arithmetic.
2238
*----------------------------------------------------------------------------*/
2239

    
2240
int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2241
{
2242
    flag aSign, bSign;
2243
    bits32 av, bv;
2244

    
2245
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2246
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2247
       ) {
2248
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2249
            float_raise( float_flag_invalid STATUS_VAR);
2250
        }
2251
        return 0;
2252
    }
2253
    aSign = extractFloat32Sign( a );
2254
    bSign = extractFloat32Sign( b );
2255
    av = float32_val(a);
2256
    bv = float32_val(b);
2257
    if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2258
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2259

    
2260
}
2261

    
2262
/*----------------------------------------------------------------------------
2263
| Returns the result of converting the double-precision floating-point value
2264
| `a' to the 32-bit two's complement integer format.  The conversion is
2265
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2266
| Arithmetic---which means in particular that the conversion is rounded
2267
| according to the current rounding mode.  If `a' is a NaN, the largest
2268
| positive integer is returned.  Otherwise, if the conversion overflows, the
2269
| largest integer with the same sign as `a' is returned.
2270
*----------------------------------------------------------------------------*/
2271

    
2272
int32 float64_to_int32( float64 a STATUS_PARAM )
2273
{
2274
    flag aSign;
2275
    int16 aExp, shiftCount;
2276
    bits64 aSig;
2277

    
2278
    aSig = extractFloat64Frac( a );
2279
    aExp = extractFloat64Exp( a );
2280
    aSign = extractFloat64Sign( a );
2281
    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2282
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2283
    shiftCount = 0x42C - aExp;
2284
    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2285
    return roundAndPackInt32( aSign, aSig STATUS_VAR );
2286

    
2287
}
2288

    
2289
/*----------------------------------------------------------------------------
2290
| Returns the result of converting the double-precision floating-point value
2291
| `a' to the 32-bit two's complement integer format.  The conversion is
2292
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2293
| Arithmetic, except that the conversion is always rounded toward zero.
2294
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2295
| the conversion overflows, the largest integer with the same sign as `a' is
2296
| returned.
2297
*----------------------------------------------------------------------------*/
2298

    
2299
int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2300
{
2301
    flag aSign;
2302
    int16 aExp, shiftCount;
2303
    bits64 aSig, savedASig;
2304
    int32 z;
2305

    
2306
    aSig = extractFloat64Frac( a );
2307
    aExp = extractFloat64Exp( a );
2308
    aSign = extractFloat64Sign( a );
2309
    if ( 0x41E < aExp ) {
2310
        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2311
        goto invalid;
2312
    }
2313
    else if ( aExp < 0x3FF ) {
2314
        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2315
        return 0;
2316
    }
2317
    aSig |= LIT64( 0x0010000000000000 );
2318
    shiftCount = 0x433 - aExp;
2319
    savedASig = aSig;
2320
    aSig >>= shiftCount;
2321
    z = aSig;
2322
    if ( aSign ) z = - z;
2323
    if ( ( z < 0 ) ^ aSign ) {
2324
 invalid:
2325
        float_raise( float_flag_invalid STATUS_VAR);
2326
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2327
    }
2328
    if ( ( aSig<<shiftCount ) != savedASig ) {
2329
        STATUS(float_exception_flags) |= float_flag_inexact;
2330
    }
2331
    return z;
2332

    
2333
}
2334

    
2335
/*----------------------------------------------------------------------------
2336
| Returns the result of converting the double-precision floating-point value
2337
| `a' to the 64-bit two's complement integer format.  The conversion is
2338
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2339
| Arithmetic---which means in particular that the conversion is rounded
2340
| according to the current rounding mode.  If `a' is a NaN, the largest
2341
| positive integer is returned.  Otherwise, if the conversion overflows, the
2342
| largest integer with the same sign as `a' is returned.
2343
*----------------------------------------------------------------------------*/
2344

    
2345
int64 float64_to_int64( float64 a STATUS_PARAM )
2346
{
2347
    flag aSign;
2348
    int16 aExp, shiftCount;
2349
    bits64 aSig, aSigExtra;
2350

    
2351
    aSig = extractFloat64Frac( a );
2352
    aExp = extractFloat64Exp( a );
2353
    aSign = extractFloat64Sign( a );
2354
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2355
    shiftCount = 0x433 - aExp;
2356
    if ( shiftCount <= 0 ) {
2357
        if ( 0x43E < aExp ) {
2358
            float_raise( float_flag_invalid STATUS_VAR);
2359
            if (    ! aSign
2360
                 || (    ( aExp == 0x7FF )
2361
                      && ( aSig != LIT64( 0x0010000000000000 ) ) )
2362
               ) {
2363
                return LIT64( 0x7FFFFFFFFFFFFFFF );
2364
            }
2365
            return (sbits64) LIT64( 0x8000000000000000 );
2366
        }
2367
        aSigExtra = 0;
2368
        aSig <<= - shiftCount;
2369
    }
2370
    else {
2371
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2372
    }
2373
    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2374

    
2375
}
2376

    
2377
/*----------------------------------------------------------------------------
2378
| Returns the result of converting the double-precision floating-point value
2379
| `a' to the 64-bit two's complement integer format.  The conversion is
2380
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2381
| Arithmetic, except that the conversion is always rounded toward zero.
2382
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2383
| the conversion overflows, the largest integer with the same sign as `a' is
2384
| returned.
2385
*----------------------------------------------------------------------------*/
2386

    
2387
int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2388
{
2389
    flag aSign;
2390
    int16 aExp, shiftCount;
2391
    bits64 aSig;
2392
    int64 z;
2393

    
2394
    aSig = extractFloat64Frac( a );
2395
    aExp = extractFloat64Exp( a );
2396
    aSign = extractFloat64Sign( a );
2397
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2398
    shiftCount = aExp - 0x433;
2399
    if ( 0 <= shiftCount ) {
2400
        if ( 0x43E <= aExp ) {
2401
            if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2402
                float_raise( float_flag_invalid STATUS_VAR);
2403
                if (    ! aSign
2404
                     || (    ( aExp == 0x7FF )
2405
                          && ( aSig != LIT64( 0x0010000000000000 ) ) )
2406
                   ) {
2407
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
2408
                }
2409
            }
2410
            return (sbits64) LIT64( 0x8000000000000000 );
2411
        }
2412
        z = aSig<<shiftCount;
2413
    }
2414
    else {
2415
        if ( aExp < 0x3FE ) {
2416
            if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2417
            return 0;
2418
        }
2419
        z = aSig>>( - shiftCount );
2420
        if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2421
            STATUS(float_exception_flags) |= float_flag_inexact;
2422
        }
2423
    }
2424
    if ( aSign ) z = - z;
2425
    return z;
2426

    
2427
}
2428

    
2429
/*----------------------------------------------------------------------------
2430
| Returns the result of converting the double-precision floating-point value
2431
| `a' to the single-precision floating-point format.  The conversion is
2432
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2433
| Arithmetic.
2434
*----------------------------------------------------------------------------*/
2435

    
2436
float32 float64_to_float32( float64 a STATUS_PARAM )
2437
{
2438
    flag aSign;
2439
    int16 aExp;
2440
    bits64 aSig;
2441
    bits32 zSig;
2442

    
2443
    aSig = extractFloat64Frac( a );
2444
    aExp = extractFloat64Exp( a );
2445
    aSign = extractFloat64Sign( a );
2446
    if ( aExp == 0x7FF ) {
2447
        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2448
        return packFloat32( aSign, 0xFF, 0 );
2449
    }
2450
    shift64RightJamming( aSig, 22, &aSig );
2451
    zSig = aSig;
2452
    if ( aExp || zSig ) {
2453
        zSig |= 0x40000000;
2454
        aExp -= 0x381;
2455
    }
2456
    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2457

    
2458
}
2459

    
2460
#ifdef FLOATX80
2461

    
2462
/*----------------------------------------------------------------------------
2463
| Returns the result of converting the double-precision floating-point value
2464
| `a' to the extended double-precision floating-point format.  The conversion
2465
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2466
| Arithmetic.
2467
*----------------------------------------------------------------------------*/
2468

    
2469
floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2470
{
2471
    flag aSign;
2472
    int16 aExp;
2473
    bits64 aSig;
2474

    
2475
    aSig = extractFloat64Frac( a );
2476
    aExp = extractFloat64Exp( a );
2477
    aSign = extractFloat64Sign( a );
2478
    if ( aExp == 0x7FF ) {
2479
        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2480
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2481
    }
2482
    if ( aExp == 0 ) {
2483
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2484
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2485
    }
2486
    return
2487
        packFloatx80(
2488
            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2489

    
2490
}
2491

    
2492
#endif
2493

    
2494
#ifdef FLOAT128
2495

    
2496
/*----------------------------------------------------------------------------
2497
| Returns the result of converting the double-precision floating-point value
2498
| `a' to the quadruple-precision floating-point format.  The conversion is
2499
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2500
| Arithmetic.
2501
*----------------------------------------------------------------------------*/
2502

    
2503
float128 float64_to_float128( float64 a STATUS_PARAM )
2504
{
2505
    flag aSign;
2506
    int16 aExp;
2507
    bits64 aSig, zSig0, zSig1;
2508

    
2509
    aSig = extractFloat64Frac( a );
2510
    aExp = extractFloat64Exp( a );
2511
    aSign = extractFloat64Sign( a );
2512
    if ( aExp == 0x7FF ) {
2513
        if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2514
        return packFloat128( aSign, 0x7FFF, 0, 0 );
2515
    }
2516
    if ( aExp == 0 ) {
2517
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2518
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2519
        --aExp;
2520
    }
2521
    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2522
    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2523

    
2524
}
2525

    
2526
#endif
2527

    
2528
/*----------------------------------------------------------------------------
2529
| Rounds the double-precision floating-point value `a' to an integer, and
2530
| returns the result as a double-precision floating-point value.  The
2531
| operation is performed according to the IEC/IEEE Standard for Binary
2532
| Floating-Point Arithmetic.
2533
*----------------------------------------------------------------------------*/
2534

    
2535
float64 float64_round_to_int( float64 a STATUS_PARAM )
2536
{
2537
    flag aSign;
2538
    int16 aExp;
2539
    bits64 lastBitMask, roundBitsMask;
2540
    int8 roundingMode;
2541
    bits64 z;
2542

    
2543
    aExp = extractFloat64Exp( a );
2544
    if ( 0x433 <= aExp ) {
2545
        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2546
            return propagateFloat64NaN( a, a STATUS_VAR );
2547
        }
2548
        return a;
2549
    }
2550
    if ( aExp < 0x3FF ) {
2551
        if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
2552
        STATUS(float_exception_flags) |= float_flag_inexact;
2553
        aSign = extractFloat64Sign( a );
2554
        switch ( STATUS(float_rounding_mode) ) {
2555
         case float_round_nearest_even:
2556
            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2557
                return packFloat64( aSign, 0x3FF, 0 );
2558
            }
2559
            break;
2560
         case float_round_down:
2561
            return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2562
         case float_round_up:
2563
            return make_float64(
2564
            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2565
        }
2566
        return packFloat64( aSign, 0, 0 );
2567
    }
2568
    lastBitMask = 1;
2569
    lastBitMask <<= 0x433 - aExp;
2570
    roundBitsMask = lastBitMask - 1;
2571
    z = float64_val(a);
2572
    roundingMode = STATUS(float_rounding_mode);
2573
    if ( roundingMode == float_round_nearest_even ) {
2574
        z += lastBitMask>>1;
2575
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2576
    }
2577
    else if ( roundingMode != float_round_to_zero ) {
2578
        if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
2579
            z += roundBitsMask;
2580
        }
2581
    }
2582
    z &= ~ roundBitsMask;
2583
    if ( z != float64_val(a) )
2584
        STATUS(float_exception_flags) |= float_flag_inexact;
2585
    return make_float64(z);
2586

    
2587
}
2588

    
2589
float64 float64_trunc_to_int( float64 a STATUS_PARAM)
2590
{
2591
    int oldmode;
2592
    float64 res;
2593
    oldmode = STATUS(float_rounding_mode);
2594
    STATUS(float_rounding_mode) = float_round_to_zero;
2595
    res = float64_round_to_int(a STATUS_VAR);
2596
    STATUS(float_rounding_mode) = oldmode;
2597
    return res;
2598
}
2599

    
2600
/*----------------------------------------------------------------------------
2601
| Returns the result of adding the absolute values of the double-precision
2602
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2603
| before being returned.  `zSign' is ignored if the result is a NaN.
2604
| The addition is performed according to the IEC/IEEE Standard for Binary
2605
| Floating-Point Arithmetic.
2606
*----------------------------------------------------------------------------*/
2607

    
2608
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2609
{
2610
    int16 aExp, bExp, zExp;
2611
    bits64 aSig, bSig, zSig;
2612
    int16 expDiff;
2613

    
2614
    aSig = extractFloat64Frac( a );
2615
    aExp = extractFloat64Exp( a );
2616
    bSig = extractFloat64Frac( b );
2617
    bExp = extractFloat64Exp( b );
2618
    expDiff = aExp - bExp;
2619
    aSig <<= 9;
2620
    bSig <<= 9;
2621
    if ( 0 < expDiff ) {
2622
        if ( aExp == 0x7FF ) {
2623
            if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2624
            return a;
2625
        }
2626
        if ( bExp == 0 ) {
2627
            --expDiff;
2628
        }
2629
        else {
2630
            bSig |= LIT64( 0x2000000000000000 );
2631
        }
2632
        shift64RightJamming( bSig, expDiff, &bSig );
2633
        zExp = aExp;
2634
    }
2635
    else if ( expDiff < 0 ) {
2636
        if ( bExp == 0x7FF ) {
2637
            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2638
            return packFloat64( zSign, 0x7FF, 0 );
2639
        }
2640
        if ( aExp == 0 ) {
2641
            ++expDiff;
2642
        }
2643
        else {
2644
            aSig |= LIT64( 0x2000000000000000 );
2645
        }
2646
        shift64RightJamming( aSig, - expDiff, &aSig );
2647
        zExp = bExp;
2648
    }
2649
    else {
2650
        if ( aExp == 0x7FF ) {
2651
            if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2652
            return a;
2653
        }
2654
        if ( aExp == 0 ) {
2655
            if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
2656
            return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2657
        }
2658
        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2659
        zExp = aExp;
2660
        goto roundAndPack;
2661
    }
2662
    aSig |= LIT64( 0x2000000000000000 );
2663
    zSig = ( aSig + bSig )<<1;
2664
    --zExp;
2665
    if ( (sbits64) zSig < 0 ) {
2666
        zSig = aSig + bSig;
2667
        ++zExp;
2668
    }
2669
 roundAndPack:
2670
    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2671

    
2672
}
2673

    
2674
/*----------------------------------------------------------------------------
2675
| Returns the result of subtracting the absolute values of the double-
2676
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
2677
| difference is negated before being returned.  `zSign' is ignored if the
2678
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
2679
| Standard for Binary Floating-Point Arithmetic.
2680
*----------------------------------------------------------------------------*/
2681

    
2682
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2683
{
2684
    int16 aExp, bExp, zExp;
2685
    bits64 aSig, bSig, zSig;
2686
    int16 expDiff;
2687

    
2688
    aSig = extractFloat64Frac( a );
2689
    aExp = extractFloat64Exp( a );
2690
    bSig = extractFloat64Frac( b );
2691
    bExp = extractFloat64Exp( b );
2692
    expDiff = aExp - bExp;
2693
    aSig <<= 10;
2694
    bSig <<= 10;
2695
    if ( 0 < expDiff ) goto aExpBigger;
2696
    if ( expDiff < 0 ) goto bExpBigger;
2697
    if ( aExp == 0x7FF ) {
2698
        if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2699
        float_raise( float_flag_invalid STATUS_VAR);
2700
        return float64_default_nan;
2701
    }
2702
    if ( aExp == 0 ) {
2703
        aExp = 1;
2704
        bExp = 1;
2705
    }
2706
    if ( bSig < aSig ) goto aBigger;
2707
    if ( aSig < bSig ) goto bBigger;
2708
    return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2709
 bExpBigger:
2710
    if ( bExp == 0x7FF ) {
2711
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2712
        return packFloat64( zSign ^ 1, 0x7FF, 0 );
2713
    }
2714
    if ( aExp == 0 ) {
2715
        ++expDiff;
2716
    }
2717
    else {
2718
        aSig |= LIT64( 0x4000000000000000 );
2719
    }
2720
    shift64RightJamming( aSig, - expDiff, &aSig );
2721
    bSig |= LIT64( 0x4000000000000000 );
2722
 bBigger:
2723
    zSig = bSig - aSig;
2724
    zExp = bExp;
2725
    zSign ^= 1;
2726
    goto normalizeRoundAndPack;
2727
 aExpBigger:
2728
    if ( aExp == 0x7FF ) {
2729
        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2730
        return a;
2731
    }
2732
    if ( bExp == 0 ) {
2733
        --expDiff;
2734
    }
2735
    else {
2736
        bSig |= LIT64( 0x4000000000000000 );
2737
    }
2738
    shift64RightJamming( bSig, expDiff, &bSig );
2739
    aSig |= LIT64( 0x4000000000000000 );
2740
 aBigger:
2741
    zSig = aSig - bSig;
2742
    zExp = aExp;
2743
 normalizeRoundAndPack:
2744
    --zExp;
2745
    return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2746

    
2747
}
2748

    
2749
/*----------------------------------------------------------------------------
2750
| Returns the result of adding the double-precision floating-point values `a'
2751
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
2752
| Binary Floating-Point Arithmetic.
2753
*----------------------------------------------------------------------------*/
2754

    
2755
float64 float64_add( float64 a, float64 b STATUS_PARAM )
2756
{
2757
    flag aSign, bSign;
2758

    
2759
    aSign = extractFloat64Sign( a );
2760
    bSign = extractFloat64Sign( b );
2761
    if ( aSign == bSign ) {
2762
        return addFloat64Sigs( a, b, aSign STATUS_VAR );
2763
    }
2764
    else {
2765
        return subFloat64Sigs( a, b, aSign STATUS_VAR );
2766
    }
2767

    
2768
}
2769

    
2770
/*----------------------------------------------------------------------------
2771
| Returns the result of subtracting the double-precision floating-point values
2772
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2773
| for Binary Floating-Point Arithmetic.
2774
*----------------------------------------------------------------------------*/
2775

    
2776
float64 float64_sub( float64 a, float64 b STATUS_PARAM )
2777
{
2778
    flag aSign, bSign;
2779

    
2780
    aSign = extractFloat64Sign( a );
2781
    bSign = extractFloat64Sign( b );
2782
    if ( aSign == bSign ) {
2783
        return subFloat64Sigs( a, b, aSign STATUS_VAR );
2784
    }
2785
    else {
2786
        return addFloat64Sigs( a, b, aSign STATUS_VAR );
2787
    }
2788

    
2789
}
2790

    
2791
/*----------------------------------------------------------------------------
2792
| Returns the result of multiplying the double-precision floating-point values
2793
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2794
| for Binary Floating-Point Arithmetic.
2795
*----------------------------------------------------------------------------*/
2796

    
2797
float64 float64_mul( float64 a, float64 b STATUS_PARAM )
2798
{
2799
    flag aSign, bSign, zSign;
2800
    int16 aExp, bExp, zExp;
2801
    bits64 aSig, bSig, zSig0, zSig1;
2802

    
2803
    aSig = extractFloat64Frac( a );
2804
    aExp = extractFloat64Exp( a );
2805
    aSign = extractFloat64Sign( a );
2806
    bSig = extractFloat64Frac( b );
2807
    bExp = extractFloat64Exp( b );
2808
    bSign = extractFloat64Sign( b );
2809
    zSign = aSign ^ bSign;
2810
    if ( aExp == 0x7FF ) {
2811
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2812
            return propagateFloat64NaN( a, b STATUS_VAR );
2813
        }
2814
        if ( ( bExp | bSig ) == 0 ) {
2815
            float_raise( float_flag_invalid STATUS_VAR);
2816
            return float64_default_nan;
2817
        }
2818
        return packFloat64( zSign, 0x7FF, 0 );
2819
    }
2820
    if ( bExp == 0x7FF ) {
2821
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2822
        if ( ( aExp | aSig ) == 0 ) {
2823
            float_raise( float_flag_invalid STATUS_VAR);
2824
            return float64_default_nan;
2825
        }
2826
        return packFloat64( zSign, 0x7FF, 0 );
2827
    }
2828
    if ( aExp == 0 ) {
2829
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2830
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2831
    }
2832
    if ( bExp == 0 ) {
2833
        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2834
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2835
    }
2836
    zExp = aExp + bExp - 0x3FF;
2837
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2838
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2839
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
2840
    zSig0 |= ( zSig1 != 0 );
2841
    if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2842
        zSig0 <<= 1;
2843
        --zExp;
2844
    }
2845
    return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
2846

    
2847
}
2848

    
2849
/*----------------------------------------------------------------------------
2850
| Returns the result of dividing the double-precision floating-point value `a'
2851
| by the corresponding value `b'.  The operation is performed according to
2852
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2853
*----------------------------------------------------------------------------*/
2854

    
2855
float64 float64_div( float64 a, float64 b STATUS_PARAM )
2856
{
2857
    flag aSign, bSign, zSign;
2858
    int16 aExp, bExp, zExp;
2859
    bits64 aSig, bSig, zSig;
2860
    bits64 rem0, rem1;
2861
    bits64 term0, term1;
2862

    
2863
    aSig = extractFloat64Frac( a );
2864
    aExp = extractFloat64Exp( a );
2865
    aSign = extractFloat64Sign( a );
2866
    bSig = extractFloat64Frac( b );
2867
    bExp = extractFloat64Exp( b );
2868
    bSign = extractFloat64Sign( b );
2869
    zSign = aSign ^ bSign;
2870
    if ( aExp == 0x7FF ) {
2871
        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2872
        if ( bExp == 0x7FF ) {
2873
            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2874
            float_raise( float_flag_invalid STATUS_VAR);
2875
            return float64_default_nan;
2876
        }
2877
        return packFloat64( zSign, 0x7FF, 0 );
2878
    }
2879
    if ( bExp == 0x7FF ) {
2880
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2881
        return packFloat64( zSign, 0, 0 );
2882
    }
2883
    if ( bExp == 0 ) {
2884
        if ( bSig == 0 ) {
2885
            if ( ( aExp | aSig ) == 0 ) {
2886
                float_raise( float_flag_invalid STATUS_VAR);
2887
                return float64_default_nan;
2888
            }
2889
            float_raise( float_flag_divbyzero STATUS_VAR);
2890
            return packFloat64( zSign, 0x7FF, 0 );
2891
        }
2892
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2893
    }
2894
    if ( aExp == 0 ) {
2895
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2896
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2897
    }
2898
    zExp = aExp - bExp + 0x3FD;
2899
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2900
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2901
    if ( bSig <= ( aSig + aSig ) ) {
2902
        aSig >>= 1;
2903
        ++zExp;
2904
    }
2905
    zSig = estimateDiv128To64( aSig, 0, bSig );
2906
    if ( ( zSig & 0x1FF ) <= 2 ) {
2907
        mul64To128( bSig, zSig, &term0, &term1 );
2908
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2909
        while ( (sbits64) rem0 < 0 ) {
2910
            --zSig;
2911
            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2912
        }
2913
        zSig |= ( rem1 != 0 );
2914
    }
2915
    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2916

    
2917
}
2918

    
2919
/*----------------------------------------------------------------------------
2920
| Returns the remainder of the double-precision floating-point value `a'
2921
| with respect to the corresponding value `b'.  The operation is performed
2922
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2923
*----------------------------------------------------------------------------*/
2924

    
2925
float64 float64_rem( float64 a, float64 b STATUS_PARAM )
2926
{
2927
    flag aSign, bSign, zSign;
2928
    int16 aExp, bExp, expDiff;
2929
    bits64 aSig, bSig;
2930
    bits64 q, alternateASig;
2931
    sbits64 sigMean;
2932

    
2933
    aSig = extractFloat64Frac( a );
2934
    aExp = extractFloat64Exp( a );
2935
    aSign = extractFloat64Sign( a );
2936
    bSig = extractFloat64Frac( b );
2937
    bExp = extractFloat64Exp( b );
2938
    bSign = extractFloat64Sign( b );
2939
    if ( aExp == 0x7FF ) {
2940
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2941
            return propagateFloat64NaN( a, b STATUS_VAR );
2942
        }
2943
        float_raise( float_flag_invalid STATUS_VAR);
2944
        return float64_default_nan;
2945
    }
2946
    if ( bExp == 0x7FF ) {
2947
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2948
        return a;
2949
    }
2950
    if ( bExp == 0 ) {
2951
        if ( bSig == 0 ) {
2952
            float_raise( float_flag_invalid STATUS_VAR);
2953
            return float64_default_nan;
2954
        }
2955
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2956
    }
2957
    if ( aExp == 0 ) {
2958
        if ( aSig == 0 ) return a;
2959
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2960
    }
2961
    expDiff = aExp - bExp;
2962
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2963
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2964
    if ( expDiff < 0 ) {
2965
        if ( expDiff < -1 ) return a;
2966
        aSig >>= 1;
2967
    }
2968
    q = ( bSig <= aSig );
2969
    if ( q ) aSig -= bSig;
2970
    expDiff -= 64;
2971
    while ( 0 < expDiff ) {
2972
        q = estimateDiv128To64( aSig, 0, bSig );
2973
        q = ( 2 < q ) ? q - 2 : 0;
2974
        aSig = - ( ( bSig>>2 ) * q );
2975
        expDiff -= 62;
2976
    }
2977
    expDiff += 64;
2978
    if ( 0 < expDiff ) {
2979
        q = estimateDiv128To64( aSig, 0, bSig );
2980
        q = ( 2 < q ) ? q - 2 : 0;
2981
        q >>= 64 - expDiff;
2982
        bSig >>= 2;
2983
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2984
    }
2985
    else {
2986
        aSig >>= 2;
2987
        bSig >>= 2;
2988
    }
2989
    do {
2990
        alternateASig = aSig;
2991
        ++q;
2992
        aSig -= bSig;
2993
    } while ( 0 <= (sbits64) aSig );
2994
    sigMean = aSig + alternateASig;
2995
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2996
        aSig = alternateASig;
2997
    }
2998
    zSign = ( (sbits64) aSig < 0 );
2999
    if ( zSign ) aSig = - aSig;
3000
    return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3001

    
3002
}
3003

    
3004
/*----------------------------------------------------------------------------
3005
| Returns the square root of the double-precision floating-point value `a'.
3006
| The operation is performed according to the IEC/IEEE Standard for Binary
3007
| Floating-Point Arithmetic.
3008
*----------------------------------------------------------------------------*/
3009

    
3010
float64 float64_sqrt( float64 a STATUS_PARAM )
3011
{
3012
    flag aSign;
3013
    int16 aExp, zExp;
3014
    bits64 aSig, zSig, doubleZSig;
3015
    bits64 rem0, rem1, term0, term1;
3016

    
3017
    aSig = extractFloat64Frac( a );
3018
    aExp = extractFloat64Exp( a );
3019
    aSign = extractFloat64Sign( a );
3020
    if ( aExp == 0x7FF ) {
3021
        if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3022
        if ( ! aSign ) return a;
3023
        float_raise( float_flag_invalid STATUS_VAR);
3024
        return float64_default_nan;
3025
    }
3026
    if ( aSign ) {
3027
        if ( ( aExp | aSig ) == 0 ) return a;
3028
        float_raise( float_flag_invalid STATUS_VAR);
3029
        return float64_default_nan;
3030
    }
3031
    if ( aExp == 0 ) {
3032
        if ( aSig == 0 ) return float64_zero;
3033
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3034
    }
3035
    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3036
    aSig |= LIT64( 0x0010000000000000 );
3037
    zSig = estimateSqrt32( aExp, aSig>>21 );
3038
    aSig <<= 9 - ( aExp & 1 );
3039
    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3040
    if ( ( zSig & 0x1FF ) <= 5 ) {
3041
        doubleZSig = zSig<<1;
3042
        mul64To128( zSig, zSig, &term0, &term1 );
3043
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3044
        while ( (sbits64) rem0 < 0 ) {
3045
            --zSig;
3046
            doubleZSig -= 2;
3047
            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3048
        }
3049
        zSig |= ( ( rem0 | rem1 ) != 0 );
3050
    }
3051
    return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3052

    
3053
}
3054

    
3055
/*----------------------------------------------------------------------------
3056
| Returns the binary log of the double-precision floating-point value `a'.
3057
| The operation is performed according to the IEC/IEEE Standard for Binary
3058
| Floating-Point Arithmetic.
3059
*----------------------------------------------------------------------------*/
3060
float64 float64_log2( float64 a STATUS_PARAM )
3061
{
3062
    flag aSign, zSign;
3063
    int16 aExp;
3064
    bits64 aSig, aSig0, aSig1, zSig, i;
3065

    
3066
    aSig = extractFloat64Frac( a );
3067
    aExp = extractFloat64Exp( a );
3068
    aSign = extractFloat64Sign( a );
3069

    
3070
    if ( aExp == 0 ) {
3071
        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3072
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3073
    }
3074
    if ( aSign ) {
3075
        float_raise( float_flag_invalid STATUS_VAR);
3076
        return float64_default_nan;
3077
    }
3078
    if ( aExp == 0x7FF ) {
3079
        if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3080
        return a;
3081
    }
3082

    
3083
    aExp -= 0x3FF;
3084
    aSig |= LIT64( 0x0010000000000000 );
3085
    zSign = aExp < 0;
3086
    zSig = (bits64)aExp << 52;
3087
    for (i = 1LL << 51; i > 0; i >>= 1) {
3088
        mul64To128( aSig, aSig, &aSig0, &aSig1 );
3089
        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3090
        if ( aSig & LIT64( 0x0020000000000000 ) ) {
3091
            aSig >>= 1;
3092
            zSig |= i;
3093
        }
3094
    }
3095

    
3096
    if ( zSign )
3097
        zSig = -zSig;
3098
    return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3099
}
3100

    
3101
/*----------------------------------------------------------------------------
3102
| Returns 1 if the double-precision floating-point value `a' is equal to the
3103
| corresponding value `b', and 0 otherwise.  The comparison is performed
3104
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3105
*----------------------------------------------------------------------------*/
3106

    
3107
int float64_eq( float64 a, float64 b STATUS_PARAM )
3108
{
3109
    bits64 av, bv;
3110

    
3111
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3112
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3113
       ) {
3114
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3115
            float_raise( float_flag_invalid STATUS_VAR);
3116
        }
3117
        return 0;
3118
    }
3119
    av = float64_val(a);
3120
    bv = float64_val(b);
3121
    return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3122

    
3123
}
3124

    
3125
/*----------------------------------------------------------------------------
3126
| Returns 1 if the double-precision floating-point value `a' is less than or
3127
| equal to the corresponding value `b', and 0 otherwise.  The comparison is
3128
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3129
| Arithmetic.
3130
*----------------------------------------------------------------------------*/
3131

    
3132
int float64_le( float64 a, float64 b STATUS_PARAM )
3133
{
3134
    flag aSign, bSign;
3135
    bits64 av, bv;
3136

    
3137
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3138
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3139
       ) {
3140
        float_raise( float_flag_invalid STATUS_VAR);
3141
        return 0;
3142
    }
3143
    aSign = extractFloat64Sign( a );
3144
    bSign = extractFloat64Sign( b );
3145
    av = float64_val(a);
3146
    bv = float64_val(b);
3147
    if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3148
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3149

    
3150
}
3151

    
3152
/*----------------------------------------------------------------------------
3153
| Returns 1 if the double-precision floating-point value `a' is less than
3154
| the corresponding value `b', and 0 otherwise.  The comparison is performed
3155
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3156
*----------------------------------------------------------------------------*/
3157

    
3158
int float64_lt( float64 a, float64 b STATUS_PARAM )
3159
{
3160
    flag aSign, bSign;
3161
    bits64 av, bv;
3162

    
3163
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3164
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3165
       ) {
3166
        float_raise( float_flag_invalid STATUS_VAR);
3167
        return 0;
3168
    }
3169
    aSign = extractFloat64Sign( a );
3170
    bSign = extractFloat64Sign( b );
3171
    av = float64_val(a);
3172
    bv = float64_val(b);
3173
    if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3174
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3175

    
3176
}
3177

    
3178
/*----------------------------------------------------------------------------
3179
| Returns 1 if the double-precision floating-point value `a' is equal to the
3180
| corresponding value `b', and 0 otherwise.  The invalid exception is raised
3181
| if either operand is a NaN.  Otherwise, the comparison is performed
3182
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3183
*----------------------------------------------------------------------------*/
3184

    
3185
int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3186
{
3187
    bits64 av, bv;
3188

    
3189
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3190
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3191
       ) {
3192
        float_raise( float_flag_invalid STATUS_VAR);
3193
        return 0;
3194
    }
3195
    av = float64_val(a);
3196
    bv = float64_val(b);
3197
    return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3198

    
3199
}
3200

    
3201
/*----------------------------------------------------------------------------
3202
| Returns 1 if the double-precision floating-point value `a' is less than or
3203
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3204
| cause an exception.  Otherwise, the comparison is performed according to the
3205
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3206
*----------------------------------------------------------------------------*/
3207

    
3208
int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3209
{
3210
    flag aSign, bSign;
3211
    bits64 av, bv;
3212

    
3213
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3214
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3215
       ) {
3216
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3217
            float_raise( float_flag_invalid STATUS_VAR);
3218
        }
3219
        return 0;
3220
    }
3221
    aSign = extractFloat64Sign( a );
3222
    bSign = extractFloat64Sign( b );
3223
    av = float64_val(a);
3224
    bv = float64_val(b);
3225
    if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3226
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3227

    
3228
}
3229

    
3230
/*----------------------------------------------------------------------------
3231
| Returns 1 if the double-precision floating-point value `a' is less than
3232
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3233
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3234
| Standard for Binary Floating-Point Arithmetic.
3235
*----------------------------------------------------------------------------*/
3236

    
3237
int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3238
{
3239
    flag aSign, bSign;
3240
    bits64 av, bv;
3241

    
3242
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3243
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3244
       ) {
3245
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3246
            float_raise( float_flag_invalid STATUS_VAR);
3247
        }
3248
        return 0;
3249
    }
3250
    aSign = extractFloat64Sign( a );
3251
    bSign = extractFloat64Sign( b );
3252
    av = float64_val(a);
3253
    bv = float64_val(b);
3254
    if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3255
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3256

    
3257
}
3258

    
3259
#ifdef FLOATX80
3260

    
3261
/*----------------------------------------------------------------------------
3262
| Returns the result of converting the extended double-precision floating-
3263
| point value `a' to the 32-bit two's complement integer format.  The
3264
| conversion is performed according to the IEC/IEEE Standard for Binary
3265
| Floating-Point Arithmetic---which means in particular that the conversion
3266
| is rounded according to the current rounding mode.  If `a' is a NaN, the
3267
| largest positive integer is returned.  Otherwise, if the conversion
3268
| overflows, the largest integer with the same sign as `a' is returned.
3269
*----------------------------------------------------------------------------*/
3270

    
3271
int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3272
{
3273
    flag aSign;
3274
    int32 aExp, shiftCount;
3275
    bits64 aSig;
3276

    
3277
    aSig = extractFloatx80Frac( a );
3278
    aExp = extractFloatx80Exp( a );
3279
    aSign = extractFloatx80Sign( a );
3280
    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3281
    shiftCount = 0x4037 - aExp;
3282
    if ( shiftCount <= 0 ) shiftCount = 1;
3283
    shift64RightJamming( aSig, shiftCount, &aSig );
3284
    return roundAndPackInt32( aSign, aSig STATUS_VAR );
3285

    
3286
}
3287

    
3288
/*----------------------------------------------------------------------------
3289
| Returns the result of converting the extended double-precision floating-
3290
| point value `a' to the 32-bit two's complement integer format.  The
3291
| conversion is performed according to the IEC/IEEE Standard for Binary
3292
| Floating-Point Arithmetic, except that the conversion is always rounded
3293
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3294
| Otherwise, if the conversion overflows, the largest integer with the same
3295
| sign as `a' is returned.
3296
*----------------------------------------------------------------------------*/
3297

    
3298
int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3299
{
3300
    flag aSign;
3301
    int32 aExp, shiftCount;
3302
    bits64 aSig, savedASig;
3303
    int32 z;
3304

    
3305
    aSig = extractFloatx80Frac( a );
3306
    aExp = extractFloatx80Exp( a );
3307
    aSign = extractFloatx80Sign( a );
3308
    if ( 0x401E < aExp ) {
3309
        if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3310
        goto invalid;
3311
    }
3312
    else if ( aExp < 0x3FFF ) {
3313
        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3314
        return 0;
3315
    }
3316
    shiftCount = 0x403E - aExp;
3317
    savedASig = aSig;
3318
    aSig >>= shiftCount;
3319
    z = aSig;
3320
    if ( aSign ) z = - z;
3321
    if ( ( z < 0 ) ^ aSign ) {
3322
 invalid:
3323
        float_raise( float_flag_invalid STATUS_VAR);
3324
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3325
    }
3326
    if ( ( aSig<<shiftCount ) != savedASig ) {
3327
        STATUS(float_exception_flags) |= float_flag_inexact;
3328
    }
3329
    return z;
3330

    
3331
}
3332

    
3333
/*----------------------------------------------------------------------------
3334
| Returns the result of converting the extended double-precision floating-
3335
| point value `a' to the 64-bit two's complement integer format.  The
3336
| conversion is performed according to the IEC/IEEE Standard for Binary
3337
| Floating-Point Arithmetic---which means in particular that the conversion
3338
| is rounded according to the current rounding mode.  If `a' is a NaN,
3339
| the largest positive integer is returned.  Otherwise, if the conversion
3340
| overflows, the largest integer with the same sign as `a' is returned.
3341
*----------------------------------------------------------------------------*/
3342

    
3343
int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3344
{
3345
    flag aSign;
3346
    int32 aExp, shiftCount;
3347
    bits64 aSig, aSigExtra;
3348

    
3349
    aSig = extractFloatx80Frac( a );
3350
    aExp = extractFloatx80Exp( a );
3351
    aSign = extractFloatx80Sign( a );
3352
    shiftCount = 0x403E - aExp;
3353
    if ( shiftCount <= 0 ) {
3354
        if ( shiftCount ) {
3355
            float_raise( float_flag_invalid STATUS_VAR);
3356
            if (    ! aSign
3357
                 || (    ( aExp == 0x7FFF )
3358
                      && ( aSig != LIT64( 0x8000000000000000 ) ) )
3359
               ) {
3360
                return LIT64( 0x7FFFFFFFFFFFFFFF );
3361
            }
3362
            return (sbits64) LIT64( 0x8000000000000000 );
3363
        }
3364
        aSigExtra = 0;
3365
    }
3366
    else {
3367
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3368
    }
3369
    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3370

    
3371
}
3372

    
3373
/*----------------------------------------------------------------------------
3374
| Returns the result of converting the extended double-precision floating-
3375
| point value `a' to the 64-bit two's complement integer format.  The
3376
| conversion is performed according to the IEC/IEEE Standard for Binary
3377
| Floating-Point Arithmetic, except that the conversion is always rounded
3378
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3379
| Otherwise, if the conversion overflows, the largest integer with the same
3380
| sign as `a' is returned.
3381
*----------------------------------------------------------------------------*/
3382

    
3383
int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3384
{
3385
    flag aSign;
3386
    int32 aExp, shiftCount;
3387
    bits64 aSig;
3388
    int64 z;
3389

    
3390
    aSig = extractFloatx80Frac( a );
3391
    aExp = extractFloatx80Exp( a );
3392
    aSign = extractFloatx80Sign( a );
3393
    shiftCount = aExp - 0x403E;
3394
    if ( 0 <= shiftCount ) {
3395
        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3396
        if ( ( a.high != 0xC03E ) || aSig ) {
3397
            float_raise( float_flag_invalid STATUS_VAR);
3398
            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3399
                return LIT64( 0x7FFFFFFFFFFFFFFF );
3400
            }
3401
        }
3402
        return (sbits64) LIT64( 0x8000000000000000 );
3403
    }
3404
    else if ( aExp < 0x3FFF ) {
3405
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3406
        return 0;
3407
    }
3408
    z = aSig>>( - shiftCount );
3409
    if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3410
        STATUS(float_exception_flags) |= float_flag_inexact;
3411
    }
3412
    if ( aSign ) z = - z;
3413
    return z;
3414

    
3415
}
3416

    
3417
/*----------------------------------------------------------------------------
3418
| Returns the result of converting the extended double-precision floating-
3419
| point value `a' to the single-precision floating-point format.  The
3420
| conversion is performed according to the IEC/IEEE Standard for Binary
3421
| Floating-Point Arithmetic.
3422
*----------------------------------------------------------------------------*/
3423

    
3424
float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3425
{
3426
    flag aSign;
3427
    int32 aExp;
3428
    bits64 aSig;
3429

    
3430
    aSig = extractFloatx80Frac( a );
3431
    aExp = extractFloatx80Exp( a );
3432
    aSign = extractFloatx80Sign( a );
3433
    if ( aExp == 0x7FFF ) {
3434
        if ( (bits64) ( aSig<<1 ) ) {
3435
            return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3436
        }
3437
        return packFloat32( aSign, 0xFF, 0 );
3438
    }
3439
    shift64RightJamming( aSig, 33, &aSig );
3440
    if ( aExp || aSig ) aExp -= 0x3F81;
3441
    return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3442

    
3443
}
3444

    
3445
/*----------------------------------------------------------------------------
3446
| Returns the result of converting the extended double-precision floating-
3447
| point value `a' to the double-precision floating-point format.  The
3448
| conversion is performed according to the IEC/IEEE Standard for Binary
3449
| Floating-Point Arithmetic.
3450
*----------------------------------------------------------------------------*/
3451

    
3452
float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3453
{
3454
    flag aSign;
3455
    int32 aExp;
3456
    bits64 aSig, zSig;
3457

    
3458
    aSig = extractFloatx80Frac( a );
3459
    aExp = extractFloatx80Exp( a );
3460
    aSign = extractFloatx80Sign( a );
3461
    if ( aExp == 0x7FFF ) {
3462
        if ( (bits64) ( aSig<<1 ) ) {
3463
            return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3464
        }
3465
        return packFloat64( aSign, 0x7FF, 0 );
3466
    }
3467
    shift64RightJamming( aSig, 1, &zSig );
3468
    if ( aExp || aSig ) aExp -= 0x3C01;
3469
    return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3470

    
3471
}
3472

    
3473
#ifdef FLOAT128
3474

    
3475
/*----------------------------------------------------------------------------
3476
| Returns the result of converting the extended double-precision floating-
3477
| point value `a' to the quadruple-precision floating-point format.  The
3478
| conversion is performed according to the IEC/IEEE Standard for Binary
3479
| Floating-Point Arithmetic.
3480
*----------------------------------------------------------------------------*/
3481

    
3482
float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3483
{
3484
    flag aSign;
3485
    int16 aExp;
3486
    bits64 aSig, zSig0, zSig1;
3487

    
3488
    aSig = extractFloatx80Frac( a );
3489
    aExp = extractFloatx80Exp( a );
3490
    aSign = extractFloatx80Sign( a );
3491
    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3492
        return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3493
    }
3494
    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3495
    return packFloat128( aSign, aExp, zSig0, zSig1 );
3496

    
3497
}
3498

    
3499
#endif
3500

    
3501
/*----------------------------------------------------------------------------
3502
| Rounds the extended double-precision floating-point value `a' to an integer,
3503
| and returns the result as an extended quadruple-precision floating-point
3504
| value.  The operation is performed according to the IEC/IEEE Standard for
3505
| Binary Floating-Point Arithmetic.
3506
*----------------------------------------------------------------------------*/
3507

    
3508
floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3509
{
3510
    flag aSign;
3511
    int32 aExp;
3512
    bits64 lastBitMask, roundBitsMask;
3513
    int8 roundingMode;
3514
    floatx80 z;
3515

    
3516
    aExp = extractFloatx80Exp( a );
3517
    if ( 0x403E <= aExp ) {
3518
        if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3519
            return propagateFloatx80NaN( a, a STATUS_VAR );
3520
        }
3521
        return a;
3522
    }
3523
    if ( aExp < 0x3FFF ) {
3524
        if (    ( aExp == 0 )
3525
             && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3526
            return a;
3527
        }
3528
        STATUS(float_exception_flags) |= float_flag_inexact;
3529
        aSign = extractFloatx80Sign( a );
3530
        switch ( STATUS(float_rounding_mode) ) {
3531
         case float_round_nearest_even:
3532
            if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3533
               ) {
3534
                return
3535
                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3536
            }
3537
            break;
3538
         case float_round_down:
3539
            return
3540
                  aSign ?
3541
                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3542
                : packFloatx80( 0, 0, 0 );
3543
         case float_round_up:
3544
            return
3545
                  aSign ? packFloatx80( 1, 0, 0 )
3546
                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3547
        }
3548
        return packFloatx80( aSign, 0, 0 );
3549
    }
3550
    lastBitMask = 1;
3551
    lastBitMask <<= 0x403E - aExp;
3552
    roundBitsMask = lastBitMask - 1;
3553
    z = a;
3554
    roundingMode = STATUS(float_rounding_mode);
3555
    if ( roundingMode == float_round_nearest_even ) {
3556
        z.low += lastBitMask>>1;
3557
        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3558
    }
3559
    else if ( roundingMode != float_round_to_zero ) {
3560
        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3561
            z.low += roundBitsMask;
3562
        }
3563
    }
3564
    z.low &= ~ roundBitsMask;
3565
    if ( z.low == 0 ) {
3566
        ++z.high;
3567
        z.low = LIT64( 0x8000000000000000 );
3568
    }
3569
    if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
3570
    return z;
3571

    
3572
}
3573

    
3574
/*----------------------------------------------------------------------------
3575
| Returns the result of adding the absolute values of the extended double-
3576
| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3577
| negated before being returned.  `zSign' is ignored if the result is a NaN.
3578
| The addition is performed according to the IEC/IEEE Standard for Binary
3579
| Floating-Point Arithmetic.
3580
*----------------------------------------------------------------------------*/
3581

    
3582
static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3583
{
3584
    int32 aExp, bExp, zExp;
3585
    bits64 aSig, bSig, zSig0, zSig1;
3586
    int32 expDiff;
3587

    
3588
    aSig = extractFloatx80Frac( a );
3589
    aExp = extractFloatx80Exp( a );
3590
    bSig = extractFloatx80Frac( b );
3591
    bExp = extractFloatx80Exp( b );
3592
    expDiff = aExp - bExp;
3593
    if ( 0 < expDiff ) {
3594
        if ( aExp == 0x7FFF ) {
3595
            if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3596
            return a;
3597
        }
3598
        if ( bExp == 0 ) --expDiff;
3599
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3600
        zExp = aExp;
3601
    }
3602
    else if ( expDiff < 0 ) {
3603
        if ( bExp == 0x7FFF ) {
3604
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3605
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3606
        }
3607
        if ( aExp == 0 ) ++expDiff;
3608
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3609
        zExp = bExp;
3610
    }
3611
    else {
3612
        if ( aExp == 0x7FFF ) {
3613
            if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3614
                return propagateFloatx80NaN( a, b STATUS_VAR );
3615
            }
3616
            return a;
3617
        }
3618
        zSig1 = 0;
3619
        zSig0 = aSig + bSig;
3620
        if ( aExp == 0 ) {
3621
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3622
            goto roundAndPack;
3623
        }
3624
        zExp = aExp;
3625
        goto shiftRight1;
3626
    }
3627
    zSig0 = aSig + bSig;
3628
    if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3629
 shiftRight1:
3630
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3631
    zSig0 |= LIT64( 0x8000000000000000 );
3632
    ++zExp;
3633
 roundAndPack:
3634
    return
3635
        roundAndPackFloatx80(
3636
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3637

    
3638
}
3639

    
3640
/*----------------------------------------------------------------------------
3641
| Returns the result of subtracting the absolute values of the extended
3642
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3643
| difference is negated before being returned.  `zSign' is ignored if the
3644
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
3645
| Standard for Binary Floating-Point Arithmetic.
3646
*----------------------------------------------------------------------------*/
3647

    
3648
static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
3649
{
3650
    int32 aExp, bExp, zExp;
3651
    bits64 aSig, bSig, zSig0, zSig1;
3652
    int32 expDiff;
3653
    floatx80 z;
3654

    
3655
    aSig = extractFloatx80Frac( a );
3656
    aExp = extractFloatx80Exp( a );
3657
    bSig = extractFloatx80Frac( b );
3658
    bExp = extractFloatx80Exp( b );
3659
    expDiff = aExp - bExp;
3660
    if ( 0 < expDiff ) goto aExpBigger;
3661
    if ( expDiff < 0 ) goto bExpBigger;
3662
    if ( aExp == 0x7FFF ) {
3663
        if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3664
            return propagateFloatx80NaN( a, b STATUS_VAR );
3665
        }
3666
        float_raise( float_flag_invalid STATUS_VAR);
3667
        z.low = floatx80_default_nan_low;
3668
        z.high = floatx80_default_nan_high;
3669
        return z;
3670
    }
3671
    if ( aExp == 0 ) {
3672
        aExp = 1;
3673
        bExp = 1;
3674
    }
3675
    zSig1 = 0;
3676
    if ( bSig < aSig ) goto aBigger;
3677
    if ( aSig < bSig ) goto bBigger;
3678
    return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3679
 bExpBigger:
3680
    if ( bExp == 0x7FFF ) {
3681
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3682
        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3683
    }
3684
    if ( aExp == 0 ) ++expDiff;
3685
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3686
 bBigger:
3687
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3688
    zExp = bExp;
3689
    zSign ^= 1;
3690
    goto normalizeRoundAndPack;
3691
 aExpBigger:
3692
    if ( aExp == 0x7FFF ) {
3693
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3694
        return a;
3695
    }
3696
    if ( bExp == 0 ) --expDiff;
3697
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3698
 aBigger:
3699
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3700
    zExp = aExp;
3701
 normalizeRoundAndPack:
3702
    return
3703
        normalizeRoundAndPackFloatx80(
3704
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3705

    
3706
}
3707

    
3708
/*----------------------------------------------------------------------------
3709
| Returns the result of adding the extended double-precision floating-point
3710
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
3711
| Standard for Binary Floating-Point Arithmetic.
3712
*----------------------------------------------------------------------------*/
3713

    
3714
floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
3715
{
3716
    flag aSign, bSign;
3717

    
3718
    aSign = extractFloatx80Sign( a );
3719
    bSign = extractFloatx80Sign( b );
3720
    if ( aSign == bSign ) {
3721
        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3722
    }
3723
    else {
3724
        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3725
    }
3726

    
3727
}
3728

    
3729
/*----------------------------------------------------------------------------
3730
| Returns the result of subtracting the extended double-precision floating-
3731
| point values `a' and `b'.  The operation is performed according to the
3732
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3733
*----------------------------------------------------------------------------*/
3734

    
3735
floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
3736
{
3737
    flag aSign, bSign;
3738

    
3739
    aSign = extractFloatx80Sign( a );
3740
    bSign = extractFloatx80Sign( b );
3741
    if ( aSign == bSign ) {
3742
        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3743
    }
3744
    else {
3745
        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3746
    }
3747

    
3748
}
3749

    
3750
/*----------------------------------------------------------------------------
3751
| Returns the result of multiplying the extended double-precision floating-
3752
| point values `a' and `b'.  The operation is performed according to the
3753
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3754
*----------------------------------------------------------------------------*/
3755

    
3756
floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
3757
{
3758
    flag aSign, bSign, zSign;
3759
    int32 aExp, bExp, zExp;
3760
    bits64 aSig, bSig, zSig0, zSig1;
3761
    floatx80 z;
3762

    
3763
    aSig = extractFloatx80Frac( a );
3764
    aExp = extractFloatx80Exp( a );
3765
    aSign = extractFloatx80Sign( a );
3766
    bSig = extractFloatx80Frac( b );
3767
    bExp = extractFloatx80Exp( b );
3768
    bSign = extractFloatx80Sign( b );
3769
    zSign = aSign ^ bSign;
3770
    if ( aExp == 0x7FFF ) {
3771
        if (    (bits64) ( aSig<<1 )
3772
             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3773
            return propagateFloatx80NaN( a, b STATUS_VAR );
3774
        }
3775
        if ( ( bExp | bSig ) == 0 ) goto invalid;
3776
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3777
    }
3778
    if ( bExp == 0x7FFF ) {
3779
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3780
        if ( ( aExp | aSig ) == 0 ) {
3781
 invalid:
3782
            float_raise( float_flag_invalid STATUS_VAR);
3783
            z.low = floatx80_default_nan_low;
3784
            z.high = floatx80_default_nan_high;
3785
            return z;
3786
        }
3787
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3788
    }
3789
    if ( aExp == 0 ) {
3790
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3791
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3792
    }
3793
    if ( bExp == 0 ) {
3794
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3795
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3796
    }
3797
    zExp = aExp + bExp - 0x3FFE;
3798
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
3799
    if ( 0 < (sbits64) zSig0 ) {
3800
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3801
        --zExp;
3802
    }
3803
    return
3804
        roundAndPackFloatx80(
3805
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3806

    
3807
}
3808

    
3809
/*----------------------------------------------------------------------------
3810
| Returns the result of dividing the extended double-precision floating-point
3811
| value `a' by the corresponding value `b'.  The operation is performed
3812
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3813
*----------------------------------------------------------------------------*/
3814

    
3815
floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
3816
{
3817
    flag aSign, bSign, zSign;
3818
    int32 aExp, bExp, zExp;
3819
    bits64 aSig, bSig, zSig0, zSig1;
3820
    bits64 rem0, rem1, rem2, term0, term1, term2;
3821
    floatx80 z;
3822

    
3823
    aSig = extractFloatx80Frac( a );
3824
    aExp = extractFloatx80Exp( a );
3825
    aSign = extractFloatx80Sign( a );
3826
    bSig = extractFloatx80Frac( b );
3827
    bExp = extractFloatx80Exp( b );
3828
    bSign = extractFloatx80Sign( b );
3829
    zSign = aSign ^ bSign;
3830
    if ( aExp == 0x7FFF ) {
3831
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3832
        if ( bExp == 0x7FFF ) {
3833
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3834
            goto invalid;
3835
        }
3836
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3837
    }
3838
    if ( bExp == 0x7FFF ) {
3839
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3840
        return packFloatx80( zSign, 0, 0 );
3841
    }
3842
    if ( bExp == 0 ) {
3843
        if ( bSig == 0 ) {
3844
            if ( ( aExp | aSig ) == 0 ) {
3845
 invalid:
3846
                float_raise( float_flag_invalid STATUS_VAR);
3847
                z.low = floatx80_default_nan_low;
3848
                z.high = floatx80_default_nan_high;
3849
                return z;
3850
            }
3851
            float_raise( float_flag_divbyzero STATUS_VAR);
3852
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3853
        }
3854
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3855
    }
3856
    if ( aExp == 0 ) {
3857
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3858
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3859
    }
3860
    zExp = aExp - bExp + 0x3FFE;
3861
    rem1 = 0;
3862
    if ( bSig <= aSig ) {
3863
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
3864
        ++zExp;
3865
    }
3866
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3867
    mul64To128( bSig, zSig0, &term0, &term1 );
3868
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3869
    while ( (sbits64) rem0 < 0 ) {
3870
        --zSig0;
3871
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3872
    }
3873
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
3874
    if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3875
        mul64To128( bSig, zSig1, &term1, &term2 );
3876
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3877
        while ( (sbits64) rem1 < 0 ) {
3878
            --zSig1;
3879
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3880
        }
3881
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
3882
    }
3883
    return
3884
        roundAndPackFloatx80(
3885
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3886

    
3887
}
3888

    
3889
/*----------------------------------------------------------------------------
3890
| Returns the remainder of the extended double-precision floating-point value
3891
| `a' with respect to the corresponding value `b'.  The operation is performed
3892
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3893
*----------------------------------------------------------------------------*/
3894

    
3895
floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
3896
{
3897
    flag aSign, bSign, zSign;
3898
    int32 aExp, bExp, expDiff;
3899
    bits64 aSig0, aSig1, bSig;
3900
    bits64 q, term0, term1, alternateASig0, alternateASig1;
3901
    floatx80 z;
3902

    
3903
    aSig0 = extractFloatx80Frac( a );
3904
    aExp = extractFloatx80Exp( a );
3905
    aSign = extractFloatx80Sign( a );
3906
    bSig = extractFloatx80Frac( b );
3907
    bExp = extractFloatx80Exp( b );
3908
    bSign = extractFloatx80Sign( b );
3909
    if ( aExp == 0x7FFF ) {
3910
        if (    (bits64) ( aSig0<<1 )
3911
             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3912
            return propagateFloatx80NaN( a, b STATUS_VAR );
3913
        }
3914
        goto invalid;
3915
    }
3916
    if ( bExp == 0x7FFF ) {
3917
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3918
        return a;
3919
    }
3920
    if ( bExp == 0 ) {
3921
        if ( bSig == 0 ) {
3922
 invalid:
3923
            float_raise( float_flag_invalid STATUS_VAR);
3924
            z.low = floatx80_default_nan_low;
3925
            z.high = floatx80_default_nan_high;
3926
            return z;
3927
        }
3928
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3929
    }
3930
    if ( aExp == 0 ) {
3931
        if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3932
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3933
    }
3934
    bSig |= LIT64( 0x8000000000000000 );
3935
    zSign = aSign;
3936
    expDiff = aExp - bExp;
3937
    aSig1 = 0;
3938
    if ( expDiff < 0 ) {
3939
        if ( expDiff < -1 ) return a;
3940
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3941
        expDiff = 0;
3942
    }
3943
    q = ( bSig <= aSig0 );
3944
    if ( q ) aSig0 -= bSig;
3945
    expDiff -= 64;
3946
    while ( 0 < expDiff ) {
3947
        q = estimateDiv128To64( aSig0, aSig1, bSig );
3948
        q = ( 2 < q ) ? q - 2 : 0;
3949
        mul64To128( bSig, q, &term0, &term1 );
3950
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3951
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3952
        expDiff -= 62;
3953
    }
3954
    expDiff += 64;
3955
    if ( 0 < expDiff ) {
3956
        q = estimateDiv128To64( aSig0, aSig1, bSig );
3957
        q = ( 2 < q ) ? q - 2 : 0;
3958
        q >>= 64 - expDiff;
3959
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3960
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3961
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3962
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
3963
            ++q;
3964
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3965
        }
3966
    }
3967
    else {
3968
        term1 = 0;
3969
        term0 = bSig;
3970
    }
3971
    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3972
    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3973
         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3974
              && ( q & 1 ) )
3975
       ) {
3976
        aSig0 = alternateASig0;
3977
        aSig1 = alternateASig1;
3978
        zSign = ! zSign;
3979
    }
3980
    return
3981
        normalizeRoundAndPackFloatx80(
3982
            80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
3983

    
3984
}
3985

    
3986
/*----------------------------------------------------------------------------
3987
| Returns the square root of the extended double-precision floating-point
3988
| value `a'.  The operation is performed according to the IEC/IEEE Standard
3989
| for Binary Floating-Point Arithmetic.
3990
*----------------------------------------------------------------------------*/
3991

    
3992
floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
3993
{
3994
    flag aSign;
3995
    int32 aExp, zExp;
3996
    bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3997
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3998
    floatx80 z;
3999

    
4000
    aSig0 = extractFloatx80Frac( a );
4001
    aExp = extractFloatx80Exp( a );
4002
    aSign = extractFloatx80Sign( a );
4003
    if ( aExp == 0x7FFF ) {
4004
        if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4005
        if ( ! aSign ) return a;
4006
        goto invalid;
4007
    }
4008
    if ( aSign ) {
4009
        if ( ( aExp | aSig0 ) == 0 ) return a;
4010
 invalid:
4011
        float_raise( float_flag_invalid STATUS_VAR);
4012
        z.low = floatx80_default_nan_low;
4013
        z.high = floatx80_default_nan_high;
4014
        return z;
4015
    }
4016
    if ( aExp == 0 ) {
4017
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4018
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4019
    }
4020
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4021
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4022
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4023
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4024
    doubleZSig0 = zSig0<<1;
4025
    mul64To128( zSig0, zSig0, &term0, &term1 );
4026
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4027
    while ( (sbits64) rem0 < 0 ) {
4028
        --zSig0;
4029
        doubleZSig0 -= 2;
4030
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4031
    }
4032
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4033
    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4034
        if ( zSig1 == 0 ) zSig1 = 1;
4035
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4036
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4037
        mul64To128( zSig1, zSig1, &term2, &term3 );
4038
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4039
        while ( (sbits64) rem1 < 0 ) {
4040
            --zSig1;
4041
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4042
            term3 |= 1;
4043
            term2 |= doubleZSig0;
4044
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4045
        }
4046
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4047
    }
4048
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4049
    zSig0 |= doubleZSig0;
4050
    return
4051
        roundAndPackFloatx80(
4052
            STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4053

    
4054
}
4055

    
4056
/*----------------------------------------------------------------------------
4057
| Returns 1 if the extended double-precision floating-point value `a' is
4058
| equal to the corresponding value `b', and 0 otherwise.  The comparison is
4059
| performed according to the IEC/IEEE Standard for Binary Floating-Point
4060
| Arithmetic.
4061
*----------------------------------------------------------------------------*/
4062

    
4063
int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4064
{
4065

    
4066
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4067
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4068
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4069
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4070
       ) {
4071
        if (    floatx80_is_signaling_nan( a )
4072
             || floatx80_is_signaling_nan( b ) ) {
4073
            float_raise( float_flag_invalid STATUS_VAR);
4074
        }
4075
        return 0;
4076
    }
4077
    return
4078
           ( a.low == b.low )
4079
        && (    ( a.high == b.high )
4080
             || (    ( a.low == 0 )
4081
                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4082
           );
4083

    
4084
}
4085

    
4086
/*----------------------------------------------------------------------------
4087
| Returns 1 if the extended double-precision floating-point value `a' is
4088
| less than or equal to the corresponding value `b', and 0 otherwise.  The
4089
| comparison is performed according to the IEC/IEEE Standard for Binary
4090
| Floating-Point Arithmetic.
4091
*----------------------------------------------------------------------------*/
4092

    
4093
int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4094
{
4095
    flag aSign, bSign;
4096

    
4097
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4098
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4099
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4100
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4101
       ) {
4102
        float_raise( float_flag_invalid STATUS_VAR);
4103
        return 0;
4104
    }
4105
    aSign = extractFloatx80Sign( a );
4106
    bSign = extractFloatx80Sign( b );
4107
    if ( aSign != bSign ) {
4108
        return
4109
               aSign
4110
            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4111
                 == 0 );
4112
    }
4113
    return
4114
          aSign ? le128( b.high, b.low, a.high, a.low )
4115
        : le128( a.high, a.low, b.high, b.low );
4116

    
4117
}
4118

    
4119
/*----------------------------------------------------------------------------
4120
| Returns 1 if the extended double-precision floating-point value `a' is
4121
| less than the corresponding value `b', and 0 otherwise.  The comparison
4122
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4123
| Arithmetic.
4124
*----------------------------------------------------------------------------*/
4125

    
4126
int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4127
{
4128
    flag aSign, bSign;
4129

    
4130
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4131
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4132
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4133
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4134
       ) {
4135
        float_raise( float_flag_invalid STATUS_VAR);
4136
        return 0;
4137
    }
4138
    aSign = extractFloatx80Sign( a );
4139
    bSign = extractFloatx80Sign( b );
4140
    if ( aSign != bSign ) {
4141
        return
4142
               aSign
4143
            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4144
                 != 0 );
4145
    }
4146
    return
4147
          aSign ? lt128( b.high, b.low, a.high, a.low )
4148
        : lt128( a.high, a.low, b.high, b.low );
4149

    
4150
}
4151

    
4152
/*----------------------------------------------------------------------------
4153
| Returns 1 if the extended double-precision floating-point value `a' is equal
4154
| to the corresponding value `b', and 0 otherwise.  The invalid exception is
4155
| raised if either operand is a NaN.  Otherwise, the comparison is performed
4156
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4157
*----------------------------------------------------------------------------*/
4158

    
4159
int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4160
{
4161

    
4162
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4163
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4164
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4165
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4166
       ) {
4167
        float_raise( float_flag_invalid STATUS_VAR);
4168
        return 0;
4169
    }
4170
    return
4171
           ( a.low == b.low )
4172
        && (    ( a.high == b.high )
4173
             || (    ( a.low == 0 )
4174
                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4175
           );
4176

    
4177
}
4178

    
4179
/*----------------------------------------------------------------------------
4180
| Returns 1 if the extended double-precision floating-point value `a' is less
4181
| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4182
| do not cause an exception.  Otherwise, the comparison is performed according
4183
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4184
*----------------------------------------------------------------------------*/
4185

    
4186
int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4187
{
4188
    flag aSign, bSign;
4189

    
4190
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4191
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4192
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4193
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4194
       ) {
4195
        if (    floatx80_is_signaling_nan( a )
4196
             || floatx80_is_signaling_nan( b ) ) {
4197
            float_raise( float_flag_invalid STATUS_VAR);
4198
        }
4199
        return 0;
4200
    }
4201
    aSign = extractFloatx80Sign( a );
4202
    bSign = extractFloatx80Sign( b );
4203
    if ( aSign != bSign ) {
4204
        return
4205
               aSign
4206
            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4207
                 == 0 );
4208
    }
4209
    return
4210
          aSign ? le128( b.high, b.low, a.high, a.low )
4211
        : le128( a.high, a.low, b.high, b.low );
4212

    
4213
}
4214

    
4215
/*----------------------------------------------------------------------------
4216
| Returns 1 if the extended double-precision floating-point value `a' is less
4217
| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4218
| an exception.  Otherwise, the comparison is performed according to the
4219
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4220
*----------------------------------------------------------------------------*/
4221

    
4222
int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4223
{
4224
    flag aSign, bSign;
4225

    
4226
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4227
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4228
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4229
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4230
       ) {
4231
        if (    floatx80_is_signaling_nan( a )
4232
             || floatx80_is_signaling_nan( b ) ) {
4233
            float_raise( float_flag_invalid STATUS_VAR);
4234
        }
4235
        return 0;
4236
    }
4237
    aSign = extractFloatx80Sign( a );
4238
    bSign = extractFloatx80Sign( b );
4239
    if ( aSign != bSign ) {
4240
        return
4241
               aSign
4242
            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4243
                 != 0 );
4244
    }
4245
    return
4246
          aSign ? lt128( b.high, b.low, a.high, a.low )
4247
        : lt128( a.high, a.low, b.high, b.low );
4248

    
4249
}
4250

    
4251
#endif
4252

    
4253
#ifdef FLOAT128
4254

    
4255
/*----------------------------------------------------------------------------
4256
| Returns the result of converting the quadruple-precision floating-point
4257
| value `a' to the 32-bit two's complement integer format.  The conversion
4258
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4259
| Arithmetic---which means in particular that the conversion is rounded
4260
| according to the current rounding mode.  If `a' is a NaN, the largest
4261
| positive integer is returned.  Otherwise, if the conversion overflows, the
4262
| largest integer with the same sign as `a' is returned.
4263
*----------------------------------------------------------------------------*/
4264

    
4265
int32 float128_to_int32( float128 a STATUS_PARAM )
4266
{
4267
    flag aSign;
4268
    int32 aExp, shiftCount;
4269
    bits64 aSig0, aSig1;
4270

    
4271
    aSig1 = extractFloat128Frac1( a );
4272
    aSig0 = extractFloat128Frac0( a );
4273
    aExp = extractFloat128Exp( a );
4274
    aSign = extractFloat128Sign( a );
4275
    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4276
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4277
    aSig0 |= ( aSig1 != 0 );
4278
    shiftCount = 0x4028 - aExp;
4279
    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4280
    return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4281

    
4282
}
4283

    
4284
/*----------------------------------------------------------------------------
4285
| Returns the result of converting the quadruple-precision floating-point
4286
| value `a' to the 32-bit two's complement integer format.  The conversion
4287
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4288
| Arithmetic, except that the conversion is always rounded toward zero.  If
4289
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4290
| conversion overflows, the largest integer with the same sign as `a' is
4291
| returned.
4292
*----------------------------------------------------------------------------*/
4293

    
4294
int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4295
{
4296
    flag aSign;
4297
    int32 aExp, shiftCount;
4298
    bits64 aSig0, aSig1, savedASig;
4299
    int32 z;
4300

    
4301
    aSig1 = extractFloat128Frac1( a );
4302
    aSig0 = extractFloat128Frac0( a );
4303
    aExp = extractFloat128Exp( a );
4304
    aSign = extractFloat128Sign( a );
4305
    aSig0 |= ( aSig1 != 0 );
4306
    if ( 0x401E < aExp ) {
4307
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4308
        goto invalid;
4309
    }
4310
    else if ( aExp < 0x3FFF ) {
4311
        if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4312
        return 0;
4313
    }
4314
    aSig0 |= LIT64( 0x0001000000000000 );
4315
    shiftCount = 0x402F - aExp;
4316
    savedASig = aSig0;
4317
    aSig0 >>= shiftCount;
4318
    z = aSig0;
4319
    if ( aSign ) z = - z;
4320
    if ( ( z < 0 ) ^ aSign ) {
4321
 invalid:
4322
        float_raise( float_flag_invalid STATUS_VAR);
4323
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4324
    }
4325
    if ( ( aSig0<<shiftCount ) != savedASig ) {
4326
        STATUS(float_exception_flags) |= float_flag_inexact;
4327
    }
4328
    return z;
4329

    
4330
}
4331

    
4332
/*----------------------------------------------------------------------------
4333
| Returns the result of converting the quadruple-precision floating-point
4334
| value `a' to the 64-bit two's complement integer format.  The conversion
4335
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4336
| Arithmetic---which means in particular that the conversion is rounded
4337
| according to the current rounding mode.  If `a' is a NaN, the largest
4338
| positive integer is returned.  Otherwise, if the conversion overflows, the
4339
| largest integer with the same sign as `a' is returned.
4340
*----------------------------------------------------------------------------*/
4341

    
4342
int64 float128_to_int64( float128 a STATUS_PARAM )
4343
{
4344
    flag aSign;
4345
    int32 aExp, shiftCount;
4346
    bits64 aSig0, aSig1;
4347

    
4348
    aSig1 = extractFloat128Frac1( a );
4349
    aSig0 = extractFloat128Frac0( a );
4350
    aExp = extractFloat128Exp( a );
4351
    aSign = extractFloat128Sign( a );
4352
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4353
    shiftCount = 0x402F - aExp;
4354
    if ( shiftCount <= 0 ) {
4355
        if ( 0x403E < aExp ) {
4356
            float_raise( float_flag_invalid STATUS_VAR);
4357
            if (    ! aSign
4358
                 || (    ( aExp == 0x7FFF )
4359
                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4360
                    )
4361
               ) {
4362
                return LIT64( 0x7FFFFFFFFFFFFFFF );
4363
            }
4364
            return (sbits64) LIT64( 0x8000000000000000 );
4365
        }
4366
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4367
    }
4368
    else {
4369
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4370
    }
4371
    return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4372

    
4373
}
4374

    
4375
/*----------------------------------------------------------------------------
4376
| Returns the result of converting the quadruple-precision floating-point
4377
| value `a' to the 64-bit two's complement integer format.  The conversion
4378
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4379
| Arithmetic, except that the conversion is always rounded toward zero.
4380
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4381
| the conversion overflows, the largest integer with the same sign as `a' is
4382
| returned.
4383
*----------------------------------------------------------------------------*/
4384

    
4385
int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4386
{
4387
    flag aSign;
4388
    int32 aExp, shiftCount;
4389
    bits64 aSig0, aSig1;
4390
    int64 z;
4391

    
4392
    aSig1 = extractFloat128Frac1( a );
4393
    aSig0 = extractFloat128Frac0( a );
4394
    aExp = extractFloat128Exp( a );
4395
    aSign = extractFloat128Sign( a );
4396
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4397
    shiftCount = aExp - 0x402F;
4398
    if ( 0 < shiftCount ) {
4399
        if ( 0x403E <= aExp ) {
4400
            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4401
            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4402
                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4403
                if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4404
            }
4405
            else {
4406
                float_raise( float_flag_invalid STATUS_VAR);
4407
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4408
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
4409
                }
4410
            }
4411
            return (sbits64) LIT64( 0x8000000000000000 );
4412
        }
4413
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4414
        if ( (bits64) ( aSig1<<shiftCount ) ) {
4415
            STATUS(float_exception_flags) |= float_flag_inexact;
4416
        }
4417
    }
4418
    else {
4419
        if ( aExp < 0x3FFF ) {
4420
            if ( aExp | aSig0 | aSig1 ) {
4421
                STATUS(float_exception_flags) |= float_flag_inexact;
4422
            }
4423
            return 0;
4424
        }
4425
        z = aSig0>>( - shiftCount );
4426
        if (    aSig1
4427
             || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4428
            STATUS(float_exception_flags) |= float_flag_inexact;
4429
        }
4430
    }
4431
    if ( aSign ) z = - z;
4432
    return z;
4433

    
4434
}
4435

    
4436
/*----------------------------------------------------------------------------
4437
| Returns the result of converting the quadruple-precision floating-point
4438
| value `a' to the single-precision floating-point format.  The conversion
4439
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4440
| Arithmetic.
4441
*----------------------------------------------------------------------------*/
4442

    
4443
float32 float128_to_float32( float128 a STATUS_PARAM )
4444
{
4445
    flag aSign;
4446
    int32 aExp;
4447
    bits64 aSig0, aSig1;
4448
    bits32 zSig;
4449

    
4450
    aSig1 = extractFloat128Frac1( a );
4451
    aSig0 = extractFloat128Frac0( a );
4452
    aExp = extractFloat128Exp( a );
4453
    aSign = extractFloat128Sign( a );
4454
    if ( aExp == 0x7FFF ) {
4455
        if ( aSig0 | aSig1 ) {
4456
            return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4457
        }
4458
        return packFloat32( aSign, 0xFF, 0 );
4459
    }
4460
    aSig0 |= ( aSig1 != 0 );
4461
    shift64RightJamming( aSig0, 18, &aSig0 );
4462
    zSig = aSig0;
4463
    if ( aExp || zSig ) {
4464
        zSig |= 0x40000000;
4465
        aExp -= 0x3F81;
4466
    }
4467
    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4468

    
4469
}
4470

    
4471
/*----------------------------------------------------------------------------
4472
| Returns the result of converting the quadruple-precision floating-point
4473
| value `a' to the double-precision floating-point format.  The conversion
4474
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4475
| Arithmetic.
4476
*----------------------------------------------------------------------------*/
4477

    
4478
float64 float128_to_float64( float128 a STATUS_PARAM )
4479
{
4480
    flag aSign;
4481
    int32 aExp;
4482
    bits64 aSig0, aSig1;
4483

    
4484
    aSig1 = extractFloat128Frac1( a );
4485
    aSig0 = extractFloat128Frac0( a );
4486
    aExp = extractFloat128Exp( a );
4487
    aSign = extractFloat128Sign( a );
4488
    if ( aExp == 0x7FFF ) {
4489
        if ( aSig0 | aSig1 ) {
4490
            return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4491
        }
4492
        return packFloat64( aSign, 0x7FF, 0 );
4493
    }
4494
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4495
    aSig0 |= ( aSig1 != 0 );
4496
    if ( aExp || aSig0 ) {
4497
        aSig0 |= LIT64( 0x4000000000000000 );
4498
        aExp -= 0x3C01;
4499
    }
4500
    return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4501

    
4502
}
4503

    
4504
#ifdef FLOATX80
4505

    
4506
/*----------------------------------------------------------------------------
4507
| Returns the result of converting the quadruple-precision floating-point
4508
| value `a' to the extended double-precision floating-point format.  The
4509
| conversion is performed according to the IEC/IEEE Standard for Binary
4510
| Floating-Point Arithmetic.
4511
*----------------------------------------------------------------------------*/
4512

    
4513
floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4514
{
4515
    flag aSign;
4516
    int32 aExp;
4517
    bits64 aSig0, aSig1;
4518

    
4519
    aSig1 = extractFloat128Frac1( a );
4520
    aSig0 = extractFloat128Frac0( a );
4521
    aExp = extractFloat128Exp( a );
4522
    aSign = extractFloat128Sign( a );
4523
    if ( aExp == 0x7FFF ) {
4524
        if ( aSig0 | aSig1 ) {
4525
            return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4526
        }
4527
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4528
    }
4529
    if ( aExp == 0 ) {
4530
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4531
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4532
    }
4533
    else {
4534
        aSig0 |= LIT64( 0x0001000000000000 );
4535
    }
4536
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4537
    return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4538

    
4539
}
4540

    
4541
#endif
4542

    
4543
/*----------------------------------------------------------------------------
4544
| Rounds the quadruple-precision floating-point value `a' to an integer, and
4545
| returns the result as a quadruple-precision floating-point value.  The
4546
| operation is performed according to the IEC/IEEE Standard for Binary
4547
| Floating-Point Arithmetic.
4548
*----------------------------------------------------------------------------*/
4549

    
4550
float128 float128_round_to_int( float128 a STATUS_PARAM )
4551
{
4552
    flag aSign;
4553
    int32 aExp;
4554
    bits64 lastBitMask, roundBitsMask;
4555
    int8 roundingMode;
4556
    float128 z;
4557

    
4558
    aExp = extractFloat128Exp( a );
4559
    if ( 0x402F <= aExp ) {
4560
        if ( 0x406F <= aExp ) {
4561
            if (    ( aExp == 0x7FFF )
4562
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4563
               ) {
4564
                return propagateFloat128NaN( a, a STATUS_VAR );
4565
            }
4566
            return a;
4567
        }
4568
        lastBitMask = 1;
4569
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4570
        roundBitsMask = lastBitMask - 1;
4571
        z = a;
4572
        roundingMode = STATUS(float_rounding_mode);
4573
        if ( roundingMode == float_round_nearest_even ) {
4574
            if ( lastBitMask ) {
4575
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4576
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4577
            }
4578
            else {
4579
                if ( (sbits64) z.low < 0 ) {
4580
                    ++z.high;
4581
                    if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4582
                }
4583
            }
4584
        }
4585
        else if ( roundingMode != float_round_to_zero ) {
4586
            if (   extractFloat128Sign( z )
4587
                 ^ ( roundingMode == float_round_up ) ) {
4588
                add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4589
            }
4590
        }
4591
        z.low &= ~ roundBitsMask;
4592
    }
4593
    else {
4594
        if ( aExp < 0x3FFF ) {
4595
            if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4596
            STATUS(float_exception_flags) |= float_flag_inexact;
4597
            aSign = extractFloat128Sign( a );
4598
            switch ( STATUS(float_rounding_mode) ) {
4599
             case float_round_nearest_even:
4600
                if (    ( aExp == 0x3FFE )
4601
                     && (   extractFloat128Frac0( a )
4602
                          | extractFloat128Frac1( a ) )
4603
                   ) {
4604
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
4605
                }
4606
                break;
4607
             case float_round_down:
4608
                return
4609
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4610
                    : packFloat128( 0, 0, 0, 0 );
4611
             case float_round_up:
4612
                return
4613
                      aSign ? packFloat128( 1, 0, 0, 0 )
4614
                    : packFloat128( 0, 0x3FFF, 0, 0 );
4615
            }
4616
            return packFloat128( aSign, 0, 0, 0 );
4617
        }
4618
        lastBitMask = 1;
4619
        lastBitMask <<= 0x402F - aExp;
4620
        roundBitsMask = lastBitMask - 1;
4621
        z.low = 0;
4622
        z.high = a.high;
4623
        roundingMode = STATUS(float_rounding_mode);
4624
        if ( roundingMode == float_round_nearest_even ) {
4625
            z.high += lastBitMask>>1;
4626
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4627
                z.high &= ~ lastBitMask;
4628
            }
4629
        }
4630
        else if ( roundingMode != float_round_to_zero ) {
4631
            if (   extractFloat128Sign( z )
4632
                 ^ ( roundingMode == float_round_up ) ) {
4633
                z.high |= ( a.low != 0 );
4634
                z.high += roundBitsMask;
4635
            }
4636
        }
4637
        z.high &= ~ roundBitsMask;
4638
    }
4639
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4640
        STATUS(float_exception_flags) |= float_flag_inexact;
4641
    }
4642
    return z;
4643

    
4644
}
4645

    
4646
/*----------------------------------------------------------------------------
4647
| Returns the result of adding the absolute values of the quadruple-precision
4648
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4649
| before being returned.  `zSign' is ignored if the result is a NaN.
4650
| The addition is performed according to the IEC/IEEE Standard for Binary
4651
| Floating-Point Arithmetic.
4652
*----------------------------------------------------------------------------*/
4653

    
4654
static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4655
{
4656
    int32 aExp, bExp, zExp;
4657
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4658
    int32 expDiff;
4659

    
4660
    aSig1 = extractFloat128Frac1( a );
4661
    aSig0 = extractFloat128Frac0( a );
4662
    aExp = extractFloat128Exp( a );
4663
    bSig1 = extractFloat128Frac1( b );
4664
    bSig0 = extractFloat128Frac0( b );
4665
    bExp = extractFloat128Exp( b );
4666
    expDiff = aExp - bExp;
4667
    if ( 0 < expDiff ) {
4668
        if ( aExp == 0x7FFF ) {
4669
            if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4670
            return a;
4671
        }
4672
        if ( bExp == 0 ) {
4673
            --expDiff;
4674
        }
4675
        else {
4676
            bSig0 |= LIT64( 0x0001000000000000 );
4677
        }
4678
        shift128ExtraRightJamming(
4679
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4680
        zExp = aExp;
4681
    }
4682
    else if ( expDiff < 0 ) {
4683
        if ( bExp == 0x7FFF ) {
4684
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4685
            return packFloat128( zSign, 0x7FFF, 0, 0 );
4686
        }
4687
        if ( aExp == 0 ) {
4688
            ++expDiff;
4689
        }
4690
        else {
4691
            aSig0 |= LIT64( 0x0001000000000000 );
4692
        }
4693
        shift128ExtraRightJamming(
4694
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4695
        zExp = bExp;
4696
    }
4697
    else {
4698
        if ( aExp == 0x7FFF ) {
4699
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4700
                return propagateFloat128NaN( a, b STATUS_VAR );
4701
            }
4702
            return a;
4703
        }
4704
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4705
        if ( aExp == 0 ) {
4706
            if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
4707
            return packFloat128( zSign, 0, zSig0, zSig1 );
4708
        }
4709
        zSig2 = 0;
4710
        zSig0 |= LIT64( 0x0002000000000000 );
4711
        zExp = aExp;
4712
        goto shiftRight1;
4713
    }
4714
    aSig0 |= LIT64( 0x0001000000000000 );
4715
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4716
    --zExp;
4717
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4718
    ++zExp;
4719
 shiftRight1:
4720
    shift128ExtraRightJamming(
4721
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4722
 roundAndPack:
4723
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4724

    
4725
}
4726

    
4727
/*----------------------------------------------------------------------------
4728
| Returns the result of subtracting the absolute values of the quadruple-
4729
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
4730
| difference is negated before being returned.  `zSign' is ignored if the
4731
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
4732
| Standard for Binary Floating-Point Arithmetic.
4733
*----------------------------------------------------------------------------*/
4734

    
4735
static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4736
{
4737
    int32 aExp, bExp, zExp;
4738
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4739
    int32 expDiff;
4740
    float128 z;
4741

    
4742
    aSig1 = extractFloat128Frac1( a );
4743
    aSig0 = extractFloat128Frac0( a );
4744
    aExp = extractFloat128Exp( a );
4745
    bSig1 = extractFloat128Frac1( b );
4746
    bSig0 = extractFloat128Frac0( b );
4747
    bExp = extractFloat128Exp( b );
4748
    expDiff = aExp - bExp;
4749
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4750
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4751
    if ( 0 < expDiff ) goto aExpBigger;
4752
    if ( expDiff < 0 ) goto bExpBigger;
4753
    if ( aExp == 0x7FFF ) {
4754
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4755
            return propagateFloat128NaN( a, b STATUS_VAR );
4756
        }
4757
        float_raise( float_flag_invalid STATUS_VAR);
4758
        z.low = float128_default_nan_low;
4759
        z.high = float128_default_nan_high;
4760
        return z;
4761
    }
4762
    if ( aExp == 0 ) {
4763
        aExp = 1;
4764
        bExp = 1;
4765
    }
4766
    if ( bSig0 < aSig0 ) goto aBigger;
4767
    if ( aSig0 < bSig0 ) goto bBigger;
4768
    if ( bSig1 < aSig1 ) goto aBigger;
4769
    if ( aSig1 < bSig1 ) goto bBigger;
4770
    return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
4771
 bExpBigger:
4772
    if ( bExp == 0x7FFF ) {
4773
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4774
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4775
    }
4776
    if ( aExp == 0 ) {
4777
        ++expDiff;
4778
    }
4779
    else {
4780
        aSig0 |= LIT64( 0x4000000000000000 );
4781
    }
4782
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4783
    bSig0 |= LIT64( 0x4000000000000000 );
4784
 bBigger:
4785
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4786
    zExp = bExp;
4787
    zSign ^= 1;
4788
    goto normalizeRoundAndPack;
4789
 aExpBigger:
4790
    if ( aExp == 0x7FFF ) {
4791
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4792
        return a;
4793
    }
4794
    if ( bExp == 0 ) {
4795
        --expDiff;
4796
    }
4797
    else {
4798
        bSig0 |= LIT64( 0x4000000000000000 );
4799
    }
4800
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4801
    aSig0 |= LIT64( 0x4000000000000000 );
4802
 aBigger:
4803
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4804
    zExp = aExp;
4805
 normalizeRoundAndPack:
4806
    --zExp;
4807
    return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
4808

    
4809
}
4810

    
4811
/*----------------------------------------------------------------------------
4812
| Returns the result of adding the quadruple-precision floating-point values
4813
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4814
| for Binary Floating-Point Arithmetic.
4815
*----------------------------------------------------------------------------*/
4816

    
4817
float128 float128_add( float128 a, float128 b STATUS_PARAM )
4818
{
4819
    flag aSign, bSign;
4820

    
4821
    aSign = extractFloat128Sign( a );
4822
    bSign = extractFloat128Sign( b );
4823
    if ( aSign == bSign ) {
4824
        return addFloat128Sigs( a, b, aSign STATUS_VAR );
4825
    }
4826
    else {
4827
        return subFloat128Sigs( a, b, aSign STATUS_VAR );
4828
    }
4829

    
4830
}
4831

    
4832
/*----------------------------------------------------------------------------
4833
| Returns the result of subtracting the quadruple-precision floating-point
4834
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4835
| Standard for Binary Floating-Point Arithmetic.
4836
*----------------------------------------------------------------------------*/
4837

    
4838
float128 float128_sub( float128 a, float128 b STATUS_PARAM )
4839
{
4840
    flag aSign, bSign;
4841

    
4842
    aSign = extractFloat128Sign( a );
4843
    bSign = extractFloat128Sign( b );
4844
    if ( aSign == bSign ) {
4845
        return subFloat128Sigs( a, b, aSign STATUS_VAR );
4846
    }
4847
    else {
4848
        return addFloat128Sigs( a, b, aSign STATUS_VAR );
4849
    }
4850

    
4851
}
4852

    
4853
/*----------------------------------------------------------------------------
4854
| Returns the result of multiplying the quadruple-precision floating-point
4855
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4856
| Standard for Binary Floating-Point Arithmetic.
4857
*----------------------------------------------------------------------------*/
4858

    
4859
float128 float128_mul( float128 a, float128 b STATUS_PARAM )
4860
{
4861
    flag aSign, bSign, zSign;
4862
    int32 aExp, bExp, zExp;
4863
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4864
    float128 z;
4865

    
4866
    aSig1 = extractFloat128Frac1( a );
4867
    aSig0 = extractFloat128Frac0( a );
4868
    aExp = extractFloat128Exp( a );
4869
    aSign = extractFloat128Sign( a );
4870
    bSig1 = extractFloat128Frac1( b );
4871
    bSig0 = extractFloat128Frac0( b );
4872
    bExp = extractFloat128Exp( b );
4873
    bSign = extractFloat128Sign( b );
4874
    zSign = aSign ^ bSign;
4875
    if ( aExp == 0x7FFF ) {
4876
        if (    ( aSig0 | aSig1 )
4877
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4878
            return propagateFloat128NaN( a, b STATUS_VAR );
4879
        }
4880
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4881
        return packFloat128( zSign, 0x7FFF, 0, 0 );
4882
    }
4883
    if ( bExp == 0x7FFF ) {
4884
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4885
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4886
 invalid:
4887
            float_raise( float_flag_invalid STATUS_VAR);
4888
            z.low = float128_default_nan_low;
4889
            z.high = float128_default_nan_high;
4890
            return z;
4891
        }
4892
        return packFloat128( zSign, 0x7FFF, 0, 0 );
4893
    }
4894
    if ( aExp == 0 ) {
4895
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4896
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4897
    }
4898
    if ( bExp == 0 ) {
4899
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4900
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4901
    }
4902
    zExp = aExp + bExp - 0x4000;
4903
    aSig0 |= LIT64( 0x0001000000000000 );
4904
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4905
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4906
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4907
    zSig2 |= ( zSig3 != 0 );
4908
    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4909
        shift128ExtraRightJamming(
4910
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4911
        ++zExp;
4912
    }
4913
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4914

    
4915
}
4916

    
4917
/*----------------------------------------------------------------------------
4918
| Returns the result of dividing the quadruple-precision floating-point value
4919
| `a' by the corresponding value `b'.  The operation is performed according to
4920
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4921
*----------------------------------------------------------------------------*/
4922

    
4923
float128 float128_div( float128 a, float128 b STATUS_PARAM )
4924
{
4925
    flag aSign, bSign, zSign;
4926
    int32 aExp, bExp, zExp;
4927
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4928
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4929
    float128 z;
4930

    
4931
    aSig1 = extractFloat128Frac1( a );
4932
    aSig0 = extractFloat128Frac0( a );
4933
    aExp = extractFloat128Exp( a );
4934
    aSign = extractFloat128Sign( a );
4935
    bSig1 = extractFloat128Frac1( b );
4936
    bSig0 = extractFloat128Frac0( b );
4937
    bExp = extractFloat128Exp( b );
4938
    bSign = extractFloat128Sign( b );
4939
    zSign = aSign ^ bSign;
4940
    if ( aExp == 0x7FFF ) {
4941
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4942
        if ( bExp == 0x7FFF ) {
4943
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4944
            goto invalid;
4945
        }
4946
        return packFloat128( zSign, 0x7FFF, 0, 0 );
4947
    }
4948
    if ( bExp == 0x7FFF ) {
4949
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4950
        return packFloat128( zSign, 0, 0, 0 );
4951
    }
4952
    if ( bExp == 0 ) {
4953
        if ( ( bSig0 | bSig1 ) == 0 ) {
4954
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4955
 invalid:
4956
                float_raise( float_flag_invalid STATUS_VAR);
4957
                z.low = float128_default_nan_low;
4958
                z.high = float128_default_nan_high;
4959
                return z;
4960
            }
4961
            float_raise( float_flag_divbyzero STATUS_VAR);
4962
            return packFloat128( zSign, 0x7FFF, 0, 0 );
4963
        }
4964
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4965
    }
4966
    if ( aExp == 0 ) {
4967
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4968
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4969
    }
4970
    zExp = aExp - bExp + 0x3FFD;
4971
    shortShift128Left(
4972
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4973
    shortShift128Left(
4974
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4975
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4976
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4977
        ++zExp;
4978
    }
4979
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4980
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4981
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4982
    while ( (sbits64) rem0 < 0 ) {
4983
        --zSig0;
4984
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4985
    }
4986
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4987
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4988
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4989
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4990
        while ( (sbits64) rem1 < 0 ) {
4991
            --zSig1;
4992
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4993
        }
4994
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4995
    }
4996
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4997
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4998

    
4999
}
5000

    
5001
/*----------------------------------------------------------------------------
5002
| Returns the remainder of the quadruple-precision floating-point value `a'
5003
| with respect to the corresponding value `b'.  The operation is performed
5004
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5005
*----------------------------------------------------------------------------*/
5006

    
5007
float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5008
{
5009
    flag aSign, bSign, zSign;
5010
    int32 aExp, bExp, expDiff;
5011
    bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5012
    bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5013
    sbits64 sigMean0;
5014
    float128 z;
5015

    
5016
    aSig1 = extractFloat128Frac1( a );
5017
    aSig0 = extractFloat128Frac0( a );
5018
    aExp = extractFloat128Exp( a );
5019
    aSign = extractFloat128Sign( a );
5020
    bSig1 = extractFloat128Frac1( b );
5021
    bSig0 = extractFloat128Frac0( b );
5022
    bExp = extractFloat128Exp( b );
5023
    bSign = extractFloat128Sign( b );
5024
    if ( aExp == 0x7FFF ) {
5025
        if (    ( aSig0 | aSig1 )
5026
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5027
            return propagateFloat128NaN( a, b STATUS_VAR );
5028
        }
5029
        goto invalid;
5030
    }
5031
    if ( bExp == 0x7FFF ) {
5032
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5033
        return a;
5034
    }
5035
    if ( bExp == 0 ) {
5036
        if ( ( bSig0 | bSig1 ) == 0 ) {
5037
 invalid:
5038
            float_raise( float_flag_invalid STATUS_VAR);
5039
            z.low = float128_default_nan_low;
5040
            z.high = float128_default_nan_high;
5041
            return z;
5042
        }
5043
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5044
    }
5045
    if ( aExp == 0 ) {
5046
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
5047
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5048
    }
5049
    expDiff = aExp - bExp;
5050
    if ( expDiff < -1 ) return a;
5051
    shortShift128Left(
5052
        aSig0 | LIT64( 0x0001000000000000 ),
5053
        aSig1,
5054
        15 - ( expDiff < 0 ),
5055
        &aSig0,
5056
        &aSig1
5057
    );
5058
    shortShift128Left(
5059
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5060
    q = le128( bSig0, bSig1, aSig0, aSig1 );
5061
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5062
    expDiff -= 64;
5063
    while ( 0 < expDiff ) {
5064
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5065
        q = ( 4 < q ) ? q - 4 : 0;
5066
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5067
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5068
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5069
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5070
        expDiff -= 61;
5071
    }
5072
    if ( -64 < expDiff ) {
5073
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5074
        q = ( 4 < q ) ? q - 4 : 0;
5075
        q >>= - expDiff;
5076
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5077
        expDiff += 52;
5078
        if ( expDiff < 0 ) {
5079
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5080
        }
5081
        else {
5082
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5083
        }
5084
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5085
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5086
    }
5087
    else {
5088
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5089
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5090
    }
5091
    do {
5092
        alternateASig0 = aSig0;
5093
        alternateASig1 = aSig1;
5094
        ++q;
5095
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5096
    } while ( 0 <= (sbits64) aSig0 );
5097
    add128(
5098
        aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5099
    if (    ( sigMean0 < 0 )
5100
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5101
        aSig0 = alternateASig0;
5102
        aSig1 = alternateASig1;
5103
    }
5104
    zSign = ( (sbits64) aSig0 < 0 );
5105
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5106
    return
5107
        normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5108

    
5109
}
5110

    
5111
/*----------------------------------------------------------------------------
5112
| Returns the square root of the quadruple-precision floating-point value `a'.
5113
| The operation is performed according to the IEC/IEEE Standard for Binary
5114
| Floating-Point Arithmetic.
5115
*----------------------------------------------------------------------------*/
5116

    
5117
float128 float128_sqrt( float128 a STATUS_PARAM )
5118
{
5119
    flag aSign;
5120
    int32 aExp, zExp;
5121
    bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5122
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5123
    float128 z;
5124

    
5125
    aSig1 = extractFloat128Frac1( a );
5126
    aSig0 = extractFloat128Frac0( a );
5127
    aExp = extractFloat128Exp( a );
5128
    aSign = extractFloat128Sign( a );
5129
    if ( aExp == 0x7FFF ) {
5130
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5131
        if ( ! aSign ) return a;
5132
        goto invalid;
5133
    }
5134
    if ( aSign ) {
5135
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5136
 invalid:
5137
        float_raise( float_flag_invalid STATUS_VAR);
5138
        z.low = float128_default_nan_low;
5139
        z.high = float128_default_nan_high;
5140
        return z;
5141
    }
5142
    if ( aExp == 0 ) {
5143
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5144
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5145
    }
5146
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5147
    aSig0 |= LIT64( 0x0001000000000000 );
5148
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5149
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5150
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5151
    doubleZSig0 = zSig0<<1;
5152
    mul64To128( zSig0, zSig0, &term0, &term1 );
5153
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5154
    while ( (sbits64) rem0 < 0 ) {
5155
        --zSig0;
5156
        doubleZSig0 -= 2;
5157
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5158
    }
5159
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5160
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5161
        if ( zSig1 == 0 ) zSig1 = 1;
5162
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5163
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5164
        mul64To128( zSig1, zSig1, &term2, &term3 );
5165
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5166
        while ( (sbits64) rem1 < 0 ) {
5167
            --zSig1;
5168
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5169
            term3 |= 1;
5170
            term2 |= doubleZSig0;
5171
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5172
        }
5173
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5174
    }
5175
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5176
    return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5177

    
5178
}
5179

    
5180
/*----------------------------------------------------------------------------
5181
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5182
| the corresponding value `b', and 0 otherwise.  The comparison is performed
5183
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5184
*----------------------------------------------------------------------------*/
5185

    
5186
int float128_eq( float128 a, float128 b STATUS_PARAM )
5187
{
5188

    
5189
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5190
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5191
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5192
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5193
       ) {
5194
        if (    float128_is_signaling_nan( a )
5195
             || float128_is_signaling_nan( b ) ) {
5196
            float_raise( float_flag_invalid STATUS_VAR);
5197
        }
5198
        return 0;
5199
    }
5200
    return
5201
           ( a.low == b.low )
5202
        && (    ( a.high == b.high )
5203
             || (    ( a.low == 0 )
5204
                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5205
           );
5206

    
5207
}
5208

    
5209
/*----------------------------------------------------------------------------
5210
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5211
| or equal to the corresponding value `b', and 0 otherwise.  The comparison
5212
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5213
| Arithmetic.
5214
*----------------------------------------------------------------------------*/
5215

    
5216
int float128_le( float128 a, float128 b STATUS_PARAM )
5217
{
5218
    flag aSign, bSign;
5219

    
5220
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5221
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5222
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5223
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5224
       ) {
5225
        float_raise( float_flag_invalid STATUS_VAR);
5226
        return 0;
5227
    }
5228
    aSign = extractFloat128Sign( a );
5229
    bSign = extractFloat128Sign( b );
5230
    if ( aSign != bSign ) {
5231
        return
5232
               aSign
5233
            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5234
                 == 0 );
5235
    }
5236
    return
5237
          aSign ? le128( b.high, b.low, a.high, a.low )
5238
        : le128( a.high, a.low, b.high, b.low );
5239

    
5240
}
5241

    
5242
/*----------------------------------------------------------------------------
5243
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5244
| the corresponding value `b', and 0 otherwise.  The comparison is performed
5245
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5246
*----------------------------------------------------------------------------*/
5247

    
5248
int float128_lt( float128 a, float128 b STATUS_PARAM )
5249
{
5250
    flag aSign, bSign;
5251

    
5252
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5253
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5254
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5255
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5256
       ) {
5257
        float_raise( float_flag_invalid STATUS_VAR);
5258
        return 0;
5259
    }
5260
    aSign = extractFloat128Sign( a );
5261
    bSign = extractFloat128Sign( b );
5262
    if ( aSign != bSign ) {
5263
        return
5264
               aSign
5265
            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5266
                 != 0 );
5267
    }
5268
    return
5269
          aSign ? lt128( b.high, b.low, a.high, a.low )
5270
        : lt128( a.high, a.low, b.high, b.low );
5271

    
5272
}
5273

    
5274
/*----------------------------------------------------------------------------
5275
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5276
| the corresponding value `b', and 0 otherwise.  The invalid exception is
5277
| raised if either operand is a NaN.  Otherwise, the comparison is performed
5278
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5279
*----------------------------------------------------------------------------*/
5280

    
5281
int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5282
{
5283

    
5284
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5285
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5286
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5287
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5288
       ) {
5289
        float_raise( float_flag_invalid STATUS_VAR);
5290
        return 0;
5291
    }
5292
    return
5293
           ( a.low == b.low )
5294
        && (    ( a.high == b.high )
5295
             || (    ( a.low == 0 )
5296
                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5297
           );
5298

    
5299
}
5300

    
5301
/*----------------------------------------------------------------------------
5302
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5303
| or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5304
| cause an exception.  Otherwise, the comparison is performed according to the
5305
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5306
*----------------------------------------------------------------------------*/
5307

    
5308
int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5309
{
5310
    flag aSign, bSign;
5311

    
5312
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5313
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5314
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5315
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5316
       ) {
5317
        if (    float128_is_signaling_nan( a )
5318
             || float128_is_signaling_nan( b ) ) {
5319
            float_raise( float_flag_invalid STATUS_VAR);
5320
        }
5321
        return 0;
5322
    }
5323
    aSign = extractFloat128Sign( a );
5324
    bSign = extractFloat128Sign( b );
5325
    if ( aSign != bSign ) {
5326
        return
5327
               aSign
5328
            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5329
                 == 0 );
5330
    }
5331
    return
5332
          aSign ? le128( b.high, b.low, a.high, a.low )
5333
        : le128( a.high, a.low, b.high, b.low );
5334

    
5335
}
5336

    
5337
/*----------------------------------------------------------------------------
5338
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5339
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5340
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5341
| Standard for Binary Floating-Point Arithmetic.
5342
*----------------------------------------------------------------------------*/
5343

    
5344
int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5345
{
5346
    flag aSign, bSign;
5347

    
5348
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5349
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5350
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5351
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5352
       ) {
5353
        if (    float128_is_signaling_nan( a )
5354
             || float128_is_signaling_nan( b ) ) {
5355
            float_raise( float_flag_invalid STATUS_VAR);
5356
        }
5357
        return 0;
5358
    }
5359
    aSign = extractFloat128Sign( a );
5360
    bSign = extractFloat128Sign( b );
5361
    if ( aSign != bSign ) {
5362
        return
5363
               aSign
5364
            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5365
                 != 0 );
5366
    }
5367
    return
5368
          aSign ? lt128( b.high, b.low, a.high, a.low )
5369
        : lt128( a.high, a.low, b.high, b.low );
5370

    
5371
}
5372

    
5373
#endif
5374

    
5375
/* misc functions */
5376
float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5377
{
5378
    return int64_to_float32(a STATUS_VAR);
5379
}
5380

    
5381
float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5382
{
5383
    return int64_to_float64(a STATUS_VAR);
5384
}
5385

    
5386
unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5387
{
5388
    int64_t v;
5389
    unsigned int res;
5390

    
5391
    v = float32_to_int64(a STATUS_VAR);
5392
    if (v < 0) {
5393
        res = 0;
5394
        float_raise( float_flag_invalid STATUS_VAR);
5395
    } else if (v > 0xffffffff) {
5396
        res = 0xffffffff;
5397
        float_raise( float_flag_invalid STATUS_VAR);
5398
    } else {
5399
        res = v;
5400
    }
5401
    return res;
5402
}
5403

    
5404
unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5405
{
5406
    int64_t v;
5407
    unsigned int res;
5408

    
5409
    v = float32_to_int64_round_to_zero(a STATUS_VAR);
5410
    if (v < 0) {
5411
        res = 0;
5412
        float_raise( float_flag_invalid STATUS_VAR);
5413
    } else if (v > 0xffffffff) {
5414
        res = 0xffffffff;
5415
        float_raise( float_flag_invalid STATUS_VAR);
5416
    } else {
5417
        res = v;
5418
    }
5419
    return res;
5420
}
5421

    
5422
unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5423
{
5424
    int64_t v;
5425
    unsigned int res;
5426

    
5427
    v = float64_to_int64(a STATUS_VAR);
5428
    if (v < 0) {
5429
        res = 0;
5430
        float_raise( float_flag_invalid STATUS_VAR);
5431
    } else if (v > 0xffffffff) {
5432
        res = 0xffffffff;
5433
        float_raise( float_flag_invalid STATUS_VAR);
5434
    } else {
5435
        res = v;
5436
    }
5437
    return res;
5438
}
5439

    
5440
unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5441
{
5442
    int64_t v;
5443
    unsigned int res;
5444

    
5445
    v = float64_to_int64_round_to_zero(a STATUS_VAR);
5446
    if (v < 0) {
5447
        res = 0;
5448
        float_raise( float_flag_invalid STATUS_VAR);
5449
    } else if (v > 0xffffffff) {
5450
        res = 0xffffffff;
5451
        float_raise( float_flag_invalid STATUS_VAR);
5452
    } else {
5453
        res = v;
5454
    }
5455
    return res;
5456
}
5457

    
5458
/* FIXME: This looks broken.  */
5459
uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5460
{
5461
    int64_t v;
5462

    
5463
    v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5464
    v += float64_val(a);
5465
    v = float64_to_int64(make_float64(v) STATUS_VAR);
5466

    
5467
    return v - INT64_MIN;
5468
}
5469

    
5470
uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5471
{
5472
    int64_t v;
5473

    
5474
    v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5475
    v += float64_val(a);
5476
    v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5477

    
5478
    return v - INT64_MIN;
5479
}
5480

    
5481
#define COMPARE(s, nan_exp)                                                  \
5482
INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
5483
                                      int is_quiet STATUS_PARAM )            \
5484
{                                                                            \
5485
    flag aSign, bSign;                                                       \
5486
    bits ## s av, bv;                                                        \
5487
                                                                             \
5488
    if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
5489
         extractFloat ## s ## Frac( a ) ) ||                                 \
5490
        ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
5491
          extractFloat ## s ## Frac( b ) )) {                                \
5492
        if (!is_quiet ||                                                     \
5493
            float ## s ## _is_signaling_nan( a ) ||                          \
5494
            float ## s ## _is_signaling_nan( b ) ) {                         \
5495
            float_raise( float_flag_invalid STATUS_VAR);                     \
5496
        }                                                                    \
5497
        return float_relation_unordered;                                     \
5498
    }                                                                        \
5499
    aSign = extractFloat ## s ## Sign( a );                                  \
5500
    bSign = extractFloat ## s ## Sign( b );                                  \
5501
    av = float ## s ## _val(a);                                              \
5502
    bv = float ## s ## _val(b);                                              \
5503
    if ( aSign != bSign ) {                                                  \
5504
        if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) {                         \
5505
            /* zero case */                                                  \
5506
            return float_relation_equal;                                     \
5507
        } else {                                                             \
5508
            return 1 - (2 * aSign);                                          \
5509
        }                                                                    \
5510
    } else {                                                                 \
5511
        if (av == bv) {                                                      \
5512
            return float_relation_equal;                                     \
5513
        } else {                                                             \
5514
            return 1 - 2 * (aSign ^ ( av < bv ));                            \
5515
        }                                                                    \
5516
    }                                                                        \
5517
}                                                                            \
5518
                                                                             \
5519
int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
5520
{                                                                            \
5521
    return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
5522
}                                                                            \
5523
                                                                             \
5524
int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
5525
{                                                                            \
5526
    return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
5527
}
5528

    
5529
COMPARE(32, 0xff)
5530
COMPARE(64, 0x7ff)
5531

    
5532
INLINE int float128_compare_internal( float128 a, float128 b,
5533
                                      int is_quiet STATUS_PARAM )
5534
{
5535
    flag aSign, bSign;
5536

    
5537
    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
5538
          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
5539
        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
5540
          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
5541
        if (!is_quiet ||
5542
            float128_is_signaling_nan( a ) ||
5543
            float128_is_signaling_nan( b ) ) {
5544
            float_raise( float_flag_invalid STATUS_VAR);
5545
        }
5546
        return float_relation_unordered;
5547
    }
5548
    aSign = extractFloat128Sign( a );
5549
    bSign = extractFloat128Sign( b );
5550
    if ( aSign != bSign ) {
5551
        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
5552
            /* zero case */
5553
            return float_relation_equal;
5554
        } else {
5555
            return 1 - (2 * aSign);
5556
        }
5557
    } else {
5558
        if (a.low == b.low && a.high == b.high) {
5559
            return float_relation_equal;
5560
        } else {
5561
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
5562
        }
5563
    }
5564
}
5565

    
5566
int float128_compare( float128 a, float128 b STATUS_PARAM )
5567
{
5568
    return float128_compare_internal(a, b, 0 STATUS_VAR);
5569
}
5570

    
5571
int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
5572
{
5573
    return float128_compare_internal(a, b, 1 STATUS_VAR);
5574
}
5575

    
5576
/* Multiply A by 2 raised to the power N.  */
5577
float32 float32_scalbn( float32 a, int n STATUS_PARAM )
5578
{
5579
    flag aSign;
5580
    int16 aExp;
5581
    bits32 aSig;
5582

    
5583
    aSig = extractFloat32Frac( a );
5584
    aExp = extractFloat32Exp( a );
5585
    aSign = extractFloat32Sign( a );
5586

    
5587
    if ( aExp == 0xFF ) {
5588
        return a;
5589
    }
5590
    if ( aExp != 0 )
5591
        aSig |= 0x00800000;
5592
    else if ( aSig == 0 )
5593
        return a;
5594

    
5595
    aExp += n - 1;
5596
    aSig <<= 7;
5597
    return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
5598
}
5599

    
5600
float64 float64_scalbn( float64 a, int n STATUS_PARAM )
5601
{
5602
    flag aSign;
5603
    int16 aExp;
5604
    bits64 aSig;
5605

    
5606
    aSig = extractFloat64Frac( a );
5607
    aExp = extractFloat64Exp( a );
5608
    aSign = extractFloat64Sign( a );
5609

    
5610
    if ( aExp == 0x7FF ) {
5611
        return a;
5612
    }
5613
    if ( aExp != 0 )
5614
        aSig |= LIT64( 0x0010000000000000 );
5615
    else if ( aSig == 0 )
5616
        return a;
5617

    
5618
    aExp += n - 1;
5619
    aSig <<= 10;
5620
    return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
5621
}
5622

    
5623
#ifdef FLOATX80
5624
floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
5625
{
5626
    flag aSign;
5627
    int16 aExp;
5628
    bits64 aSig;
5629

    
5630
    aSig = extractFloatx80Frac( a );
5631
    aExp = extractFloatx80Exp( a );
5632
    aSign = extractFloatx80Sign( a );
5633

    
5634
    if ( aExp == 0x7FF ) {
5635
        return a;
5636
    }
5637
    if (aExp == 0 && aSig == 0)
5638
        return a;
5639

    
5640
    aExp += n;
5641
    return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
5642
                                          aSign, aExp, aSig, 0 STATUS_VAR );
5643
}
5644
#endif
5645

    
5646
#ifdef FLOAT128
5647
float128 float128_scalbn( float128 a, int n STATUS_PARAM )
5648
{
5649
    flag aSign;
5650
    int32 aExp;
5651
    bits64 aSig0, aSig1;
5652

    
5653
    aSig1 = extractFloat128Frac1( a );
5654
    aSig0 = extractFloat128Frac0( a );
5655
    aExp = extractFloat128Exp( a );
5656
    aSign = extractFloat128Sign( a );
5657
    if ( aExp == 0x7FFF ) {
5658
        return a;
5659
    }
5660
    if ( aExp != 0 )
5661
        aSig0 |= LIT64( 0x0001000000000000 );
5662
    else if ( aSig0 == 0 && aSig1 == 0 )
5663
        return a;
5664

    
5665
    aExp += n - 1;
5666
    return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
5667
                                          STATUS_VAR );
5668

    
5669
}
5670
#endif