Statistics
| Branch: | Revision:

root / fpu / softfloat.c @ 1f587329

History | View | Annotate | Download (194.2 kB)

1

    
2
/*============================================================================
3

4
This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5
Package, Release 2b.
6

7
Written by John R. Hauser.  This work was made possible in part by the
8
International Computer Science Institute, located at Suite 600, 1947 Center
9
Street, Berkeley, California 94704.  Funding was partially provided by the
10
National Science Foundation under grant MIP-9311980.  The original version
11
of this code was written as part of a project to build a fixed-point vector
12
processor in collaboration with the University of California at Berkeley,
13
overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
14
is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15
arithmetic/SoftFloat.html'.
16

17
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
18
been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19
RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20
AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21
COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22
EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23
INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24
OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
25

26
Derivative works are acceptable, even for commercial purposes, so long as
27
(1) the source code for the derivative work includes prominent notice that
28
the work is derivative, and (2) the source code includes prominent notice with
29
these four paragraphs for those parts of this code that are retained.
30

31
=============================================================================*/
32

    
33
#include "softfloat.h"
34

    
35
/*----------------------------------------------------------------------------
36
| Primitive arithmetic functions, including multi-word arithmetic, and
37
| division and square root approximations.  (Can be specialized to target if
38
| desired.)
39
*----------------------------------------------------------------------------*/
40
#include "softfloat-macros.h"
41

    
42
/*----------------------------------------------------------------------------
43
| Functions and definitions to determine:  (1) whether tininess for underflow
44
| is detected before or after rounding by default, (2) what (if anything)
45
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
46
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
47
| are propagated from function inputs to output.  These details are target-
48
| specific.
49
*----------------------------------------------------------------------------*/
50
#include "softfloat-specialize.h"
51

    
52
void set_float_rounding_mode(int val STATUS_PARAM)
53
{
54
    STATUS(float_rounding_mode) = val;
55
}
56

    
57
void set_float_exception_flags(int val STATUS_PARAM)
58
{
59
    STATUS(float_exception_flags) = val;
60
}
61

    
62
#ifdef FLOATX80
63
void set_floatx80_rounding_precision(int val STATUS_PARAM)
64
{
65
    STATUS(floatx80_rounding_precision) = val;
66
}
67
#endif
68

    
69
/*----------------------------------------------------------------------------
70
| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
71
| and 7, and returns the properly rounded 32-bit integer corresponding to the
72
| input.  If `zSign' is 1, the input is negated before being converted to an
73
| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
74
| is simply rounded to an integer, with the inexact exception raised if the
75
| input cannot be represented exactly as an integer.  However, if the fixed-
76
| point input is too large, the invalid exception is raised and the largest
77
| positive or negative integer is returned.
78
*----------------------------------------------------------------------------*/
79

    
80
static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
81
{
82
    int8 roundingMode;
83
    flag roundNearestEven;
84
    int8 roundIncrement, roundBits;
85
    int32 z;
86

    
87
    roundingMode = STATUS(float_rounding_mode);
88
    roundNearestEven = ( roundingMode == float_round_nearest_even );
89
    roundIncrement = 0x40;
90
    if ( ! roundNearestEven ) {
91
        if ( roundingMode == float_round_to_zero ) {
92
            roundIncrement = 0;
93
        }
94
        else {
95
            roundIncrement = 0x7F;
96
            if ( zSign ) {
97
                if ( roundingMode == float_round_up ) roundIncrement = 0;
98
            }
99
            else {
100
                if ( roundingMode == float_round_down ) roundIncrement = 0;
101
            }
102
        }
103
    }
104
    roundBits = absZ & 0x7F;
105
    absZ = ( absZ + roundIncrement )>>7;
106
    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
107
    z = absZ;
108
    if ( zSign ) z = - z;
109
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
110
        float_raise( float_flag_invalid STATUS_VAR);
111
        return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
112
    }
113
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
114
    return z;
115

    
116
}
117

    
118
/*----------------------------------------------------------------------------
119
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
120
| `absZ1', with binary point between bits 63 and 64 (between the input words),
121
| and returns the properly rounded 64-bit integer corresponding to the input.
122
| If `zSign' is 1, the input is negated before being converted to an integer.
123
| Ordinarily, the fixed-point input is simply rounded to an integer, with
124
| the inexact exception raised if the input cannot be represented exactly as
125
| an integer.  However, if the fixed-point input is too large, the invalid
126
| exception is raised and the largest positive or negative integer is
127
| returned.
128
*----------------------------------------------------------------------------*/
129

    
130
static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
131
{
132
    int8 roundingMode;
133
    flag roundNearestEven, increment;
134
    int64 z;
135

    
136
    roundingMode = STATUS(float_rounding_mode);
137
    roundNearestEven = ( roundingMode == float_round_nearest_even );
138
    increment = ( (sbits64) absZ1 < 0 );
139
    if ( ! roundNearestEven ) {
140
        if ( roundingMode == float_round_to_zero ) {
141
            increment = 0;
142
        }
143
        else {
144
            if ( zSign ) {
145
                increment = ( roundingMode == float_round_down ) && absZ1;
146
            }
147
            else {
148
                increment = ( roundingMode == float_round_up ) && absZ1;
149
            }
150
        }
151
    }
152
    if ( increment ) {
153
        ++absZ0;
154
        if ( absZ0 == 0 ) goto overflow;
155
        absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
156
    }
157
    z = absZ0;
158
    if ( zSign ) z = - z;
159
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
160
 overflow:
161
        float_raise( float_flag_invalid STATUS_VAR);
162
        return
163
              zSign ? (sbits64) LIT64( 0x8000000000000000 )
164
            : LIT64( 0x7FFFFFFFFFFFFFFF );
165
    }
166
    if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
167
    return z;
168

    
169
}
170

    
171
/*----------------------------------------------------------------------------
172
| Returns the fraction bits of the single-precision floating-point value `a'.
173
*----------------------------------------------------------------------------*/
174

    
175
INLINE bits32 extractFloat32Frac( float32 a )
176
{
177

    
178
    return float32_val(a) & 0x007FFFFF;
179

    
180
}
181

    
182
/*----------------------------------------------------------------------------
183
| Returns the exponent bits of the single-precision floating-point value `a'.
184
*----------------------------------------------------------------------------*/
185

    
186
INLINE int16 extractFloat32Exp( float32 a )
187
{
188

    
189
    return ( float32_val(a)>>23 ) & 0xFF;
190

    
191
}
192

    
193
/*----------------------------------------------------------------------------
194
| Returns the sign bit of the single-precision floating-point value `a'.
195
*----------------------------------------------------------------------------*/
196

    
197
INLINE flag extractFloat32Sign( float32 a )
198
{
199

    
200
    return float32_val(a)>>31;
201

    
202
}
203

    
204
/*----------------------------------------------------------------------------
205
| Normalizes the subnormal single-precision floating-point value represented
206
| by the denormalized significand `aSig'.  The normalized exponent and
207
| significand are stored at the locations pointed to by `zExpPtr' and
208
| `zSigPtr', respectively.
209
*----------------------------------------------------------------------------*/
210

    
211
static void
212
 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
213
{
214
    int8 shiftCount;
215

    
216
    shiftCount = countLeadingZeros32( aSig ) - 8;
217
    *zSigPtr = aSig<<shiftCount;
218
    *zExpPtr = 1 - shiftCount;
219

    
220
}
221

    
222
/*----------------------------------------------------------------------------
223
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
224
| single-precision floating-point value, returning the result.  After being
225
| shifted into the proper positions, the three fields are simply added
226
| together to form the result.  This means that any integer portion of `zSig'
227
| will be added into the exponent.  Since a properly normalized significand
228
| will have an integer portion equal to 1, the `zExp' input should be 1 less
229
| than the desired result exponent whenever `zSig' is a complete, normalized
230
| significand.
231
*----------------------------------------------------------------------------*/
232

    
233
INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
234
{
235

    
236
    return make_float32(
237
          ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
238

    
239
}
240

    
241
/*----------------------------------------------------------------------------
242
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
243
| and significand `zSig', and returns the proper single-precision floating-
244
| point value corresponding to the abstract input.  Ordinarily, the abstract
245
| value is simply rounded and packed into the single-precision format, with
246
| the inexact exception raised if the abstract input cannot be represented
247
| exactly.  However, if the abstract value is too large, the overflow and
248
| inexact exceptions are raised and an infinity or maximal finite value is
249
| returned.  If the abstract value is too small, the input value is rounded to
250
| a subnormal number, and the underflow and inexact exceptions are raised if
251
| the abstract input cannot be represented exactly as a subnormal single-
252
| precision floating-point number.
253
|     The input significand `zSig' has its binary point between bits 30
254
| and 29, which is 7 bits to the left of the usual location.  This shifted
255
| significand must be normalized or smaller.  If `zSig' is not normalized,
256
| `zExp' must be 0; in that case, the result returned is a subnormal number,
257
| and it must not require rounding.  In the usual case that `zSig' is
258
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
259
| The handling of underflow and overflow follows the IEC/IEEE Standard for
260
| Binary Floating-Point Arithmetic.
261
*----------------------------------------------------------------------------*/
262

    
263
static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
264
{
265
    int8 roundingMode;
266
    flag roundNearestEven;
267
    int8 roundIncrement, roundBits;
268
    flag isTiny;
269

    
270
    roundingMode = STATUS(float_rounding_mode);
271
    roundNearestEven = ( roundingMode == float_round_nearest_even );
272
    roundIncrement = 0x40;
273
    if ( ! roundNearestEven ) {
274
        if ( roundingMode == float_round_to_zero ) {
275
            roundIncrement = 0;
276
        }
277
        else {
278
            roundIncrement = 0x7F;
279
            if ( zSign ) {
280
                if ( roundingMode == float_round_up ) roundIncrement = 0;
281
            }
282
            else {
283
                if ( roundingMode == float_round_down ) roundIncrement = 0;
284
            }
285
        }
286
    }
287
    roundBits = zSig & 0x7F;
288
    if ( 0xFD <= (bits16) zExp ) {
289
        if (    ( 0xFD < zExp )
290
             || (    ( zExp == 0xFD )
291
                  && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
292
           ) {
293
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
294
            return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
295
        }
296
        if ( zExp < 0 ) {
297
            isTiny =
298
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
299
                || ( zExp < -1 )
300
                || ( zSig + roundIncrement < 0x80000000 );
301
            shift32RightJamming( zSig, - zExp, &zSig );
302
            zExp = 0;
303
            roundBits = zSig & 0x7F;
304
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
305
        }
306
    }
307
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
308
    zSig = ( zSig + roundIncrement )>>7;
309
    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
310
    if ( zSig == 0 ) zExp = 0;
311
    return packFloat32( zSign, zExp, zSig );
312

    
313
}
314

    
315
/*----------------------------------------------------------------------------
316
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
317
| and significand `zSig', and returns the proper single-precision floating-
318
| point value corresponding to the abstract input.  This routine is just like
319
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
320
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
321
| floating-point exponent.
322
*----------------------------------------------------------------------------*/
323

    
324
static float32
325
 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
326
{
327
    int8 shiftCount;
328

    
329
    shiftCount = countLeadingZeros32( zSig ) - 1;
330
    return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
331

    
332
}
333

    
334
/*----------------------------------------------------------------------------
335
| Returns the fraction bits of the double-precision floating-point value `a'.
336
*----------------------------------------------------------------------------*/
337

    
338
INLINE bits64 extractFloat64Frac( float64 a )
339
{
340

    
341
    return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
342

    
343
}
344

    
345
/*----------------------------------------------------------------------------
346
| Returns the exponent bits of the double-precision floating-point value `a'.
347
*----------------------------------------------------------------------------*/
348

    
349
INLINE int16 extractFloat64Exp( float64 a )
350
{
351

    
352
    return ( float64_val(a)>>52 ) & 0x7FF;
353

    
354
}
355

    
356
/*----------------------------------------------------------------------------
357
| Returns the sign bit of the double-precision floating-point value `a'.
358
*----------------------------------------------------------------------------*/
359

    
360
INLINE flag extractFloat64Sign( float64 a )
361
{
362

    
363
    return float64_val(a)>>63;
364

    
365
}
366

    
367
/*----------------------------------------------------------------------------
368
| Normalizes the subnormal double-precision floating-point value represented
369
| by the denormalized significand `aSig'.  The normalized exponent and
370
| significand are stored at the locations pointed to by `zExpPtr' and
371
| `zSigPtr', respectively.
372
*----------------------------------------------------------------------------*/
373

    
374
static void
375
 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
376
{
377
    int8 shiftCount;
378

    
379
    shiftCount = countLeadingZeros64( aSig ) - 11;
380
    *zSigPtr = aSig<<shiftCount;
381
    *zExpPtr = 1 - shiftCount;
382

    
383
}
384

    
385
/*----------------------------------------------------------------------------
386
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
387
| double-precision floating-point value, returning the result.  After being
388
| shifted into the proper positions, the three fields are simply added
389
| together to form the result.  This means that any integer portion of `zSig'
390
| will be added into the exponent.  Since a properly normalized significand
391
| will have an integer portion equal to 1, the `zExp' input should be 1 less
392
| than the desired result exponent whenever `zSig' is a complete, normalized
393
| significand.
394
*----------------------------------------------------------------------------*/
395

    
396
INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
397
{
398

    
399
    return make_float64(
400
        ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
401

    
402
}
403

    
404
/*----------------------------------------------------------------------------
405
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
406
| and significand `zSig', and returns the proper double-precision floating-
407
| point value corresponding to the abstract input.  Ordinarily, the abstract
408
| value is simply rounded and packed into the double-precision format, with
409
| the inexact exception raised if the abstract input cannot be represented
410
| exactly.  However, if the abstract value is too large, the overflow and
411
| inexact exceptions are raised and an infinity or maximal finite value is
412
| returned.  If the abstract value is too small, the input value is rounded
413
| to a subnormal number, and the underflow and inexact exceptions are raised
414
| if the abstract input cannot be represented exactly as a subnormal double-
415
| precision floating-point number.
416
|     The input significand `zSig' has its binary point between bits 62
417
| and 61, which is 10 bits to the left of the usual location.  This shifted
418
| significand must be normalized or smaller.  If `zSig' is not normalized,
419
| `zExp' must be 0; in that case, the result returned is a subnormal number,
420
| and it must not require rounding.  In the usual case that `zSig' is
421
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
422
| The handling of underflow and overflow follows the IEC/IEEE Standard for
423
| Binary Floating-Point Arithmetic.
424
*----------------------------------------------------------------------------*/
425

    
426
static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
427
{
428
    int8 roundingMode;
429
    flag roundNearestEven;
430
    int16 roundIncrement, roundBits;
431
    flag isTiny;
432

    
433
    roundingMode = STATUS(float_rounding_mode);
434
    roundNearestEven = ( roundingMode == float_round_nearest_even );
435
    roundIncrement = 0x200;
436
    if ( ! roundNearestEven ) {
437
        if ( roundingMode == float_round_to_zero ) {
438
            roundIncrement = 0;
439
        }
440
        else {
441
            roundIncrement = 0x3FF;
442
            if ( zSign ) {
443
                if ( roundingMode == float_round_up ) roundIncrement = 0;
444
            }
445
            else {
446
                if ( roundingMode == float_round_down ) roundIncrement = 0;
447
            }
448
        }
449
    }
450
    roundBits = zSig & 0x3FF;
451
    if ( 0x7FD <= (bits16) zExp ) {
452
        if (    ( 0x7FD < zExp )
453
             || (    ( zExp == 0x7FD )
454
                  && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
455
           ) {
456
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
457
            return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
458
        }
459
        if ( zExp < 0 ) {
460
            isTiny =
461
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
462
                || ( zExp < -1 )
463
                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
464
            shift64RightJamming( zSig, - zExp, &zSig );
465
            zExp = 0;
466
            roundBits = zSig & 0x3FF;
467
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
468
        }
469
    }
470
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
471
    zSig = ( zSig + roundIncrement )>>10;
472
    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
473
    if ( zSig == 0 ) zExp = 0;
474
    return packFloat64( zSign, zExp, zSig );
475

    
476
}
477

    
478
/*----------------------------------------------------------------------------
479
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
480
| and significand `zSig', and returns the proper double-precision floating-
481
| point value corresponding to the abstract input.  This routine is just like
482
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
483
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
484
| floating-point exponent.
485
*----------------------------------------------------------------------------*/
486

    
487
static float64
488
 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
489
{
490
    int8 shiftCount;
491

    
492
    shiftCount = countLeadingZeros64( zSig ) - 1;
493
    return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
494

    
495
}
496

    
497
#ifdef FLOATX80
498

    
499
/*----------------------------------------------------------------------------
500
| Returns the fraction bits of the extended double-precision floating-point
501
| value `a'.
502
*----------------------------------------------------------------------------*/
503

    
504
INLINE bits64 extractFloatx80Frac( floatx80 a )
505
{
506

    
507
    return a.low;
508

    
509
}
510

    
511
/*----------------------------------------------------------------------------
512
| Returns the exponent bits of the extended double-precision floating-point
513
| value `a'.
514
*----------------------------------------------------------------------------*/
515

    
516
INLINE int32 extractFloatx80Exp( floatx80 a )
517
{
518

    
519
    return a.high & 0x7FFF;
520

    
521
}
522

    
523
/*----------------------------------------------------------------------------
524
| Returns the sign bit of the extended double-precision floating-point value
525
| `a'.
526
*----------------------------------------------------------------------------*/
527

    
528
INLINE flag extractFloatx80Sign( floatx80 a )
529
{
530

    
531
    return a.high>>15;
532

    
533
}
534

    
535
/*----------------------------------------------------------------------------
536
| Normalizes the subnormal extended double-precision floating-point value
537
| represented by the denormalized significand `aSig'.  The normalized exponent
538
| and significand are stored at the locations pointed to by `zExpPtr' and
539
| `zSigPtr', respectively.
540
*----------------------------------------------------------------------------*/
541

    
542
static void
543
 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
544
{
545
    int8 shiftCount;
546

    
547
    shiftCount = countLeadingZeros64( aSig );
548
    *zSigPtr = aSig<<shiftCount;
549
    *zExpPtr = 1 - shiftCount;
550

    
551
}
552

    
553
/*----------------------------------------------------------------------------
554
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
555
| extended double-precision floating-point value, returning the result.
556
*----------------------------------------------------------------------------*/
557

    
558
INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
559
{
560
    floatx80 z;
561

    
562
    z.low = zSig;
563
    z.high = ( ( (bits16) zSign )<<15 ) + zExp;
564
    return z;
565

    
566
}
567

    
568
/*----------------------------------------------------------------------------
569
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
570
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
571
| and returns the proper extended double-precision floating-point value
572
| corresponding to the abstract input.  Ordinarily, the abstract value is
573
| rounded and packed into the extended double-precision format, with the
574
| inexact exception raised if the abstract input cannot be represented
575
| exactly.  However, if the abstract value is too large, the overflow and
576
| inexact exceptions are raised and an infinity or maximal finite value is
577
| returned.  If the abstract value is too small, the input value is rounded to
578
| a subnormal number, and the underflow and inexact exceptions are raised if
579
| the abstract input cannot be represented exactly as a subnormal extended
580
| double-precision floating-point number.
581
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
582
| number of bits as single or double precision, respectively.  Otherwise, the
583
| result is rounded to the full precision of the extended double-precision
584
| format.
585
|     The input significand must be normalized or smaller.  If the input
586
| significand is not normalized, `zExp' must be 0; in that case, the result
587
| returned is a subnormal number, and it must not require rounding.  The
588
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
589
| Floating-Point Arithmetic.
590
*----------------------------------------------------------------------------*/
591

    
592
static floatx80
593
 roundAndPackFloatx80(
594
     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
595
 STATUS_PARAM)
596
{
597
    int8 roundingMode;
598
    flag roundNearestEven, increment, isTiny;
599
    int64 roundIncrement, roundMask, roundBits;
600

    
601
    roundingMode = STATUS(float_rounding_mode);
602
    roundNearestEven = ( roundingMode == float_round_nearest_even );
603
    if ( roundingPrecision == 80 ) goto precision80;
604
    if ( roundingPrecision == 64 ) {
605
        roundIncrement = LIT64( 0x0000000000000400 );
606
        roundMask = LIT64( 0x00000000000007FF );
607
    }
608
    else if ( roundingPrecision == 32 ) {
609
        roundIncrement = LIT64( 0x0000008000000000 );
610
        roundMask = LIT64( 0x000000FFFFFFFFFF );
611
    }
612
    else {
613
        goto precision80;
614
    }
615
    zSig0 |= ( zSig1 != 0 );
616
    if ( ! roundNearestEven ) {
617
        if ( roundingMode == float_round_to_zero ) {
618
            roundIncrement = 0;
619
        }
620
        else {
621
            roundIncrement = roundMask;
622
            if ( zSign ) {
623
                if ( roundingMode == float_round_up ) roundIncrement = 0;
624
            }
625
            else {
626
                if ( roundingMode == float_round_down ) roundIncrement = 0;
627
            }
628
        }
629
    }
630
    roundBits = zSig0 & roundMask;
631
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
632
        if (    ( 0x7FFE < zExp )
633
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
634
           ) {
635
            goto overflow;
636
        }
637
        if ( zExp <= 0 ) {
638
            isTiny =
639
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
640
                || ( zExp < 0 )
641
                || ( zSig0 <= zSig0 + roundIncrement );
642
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
643
            zExp = 0;
644
            roundBits = zSig0 & roundMask;
645
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
646
            if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
647
            zSig0 += roundIncrement;
648
            if ( (sbits64) zSig0 < 0 ) zExp = 1;
649
            roundIncrement = roundMask + 1;
650
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
651
                roundMask |= roundIncrement;
652
            }
653
            zSig0 &= ~ roundMask;
654
            return packFloatx80( zSign, zExp, zSig0 );
655
        }
656
    }
657
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
658
    zSig0 += roundIncrement;
659
    if ( zSig0 < roundIncrement ) {
660
        ++zExp;
661
        zSig0 = LIT64( 0x8000000000000000 );
662
    }
663
    roundIncrement = roundMask + 1;
664
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
665
        roundMask |= roundIncrement;
666
    }
667
    zSig0 &= ~ roundMask;
668
    if ( zSig0 == 0 ) zExp = 0;
669
    return packFloatx80( zSign, zExp, zSig0 );
670
 precision80:
671
    increment = ( (sbits64) zSig1 < 0 );
672
    if ( ! roundNearestEven ) {
673
        if ( roundingMode == float_round_to_zero ) {
674
            increment = 0;
675
        }
676
        else {
677
            if ( zSign ) {
678
                increment = ( roundingMode == float_round_down ) && zSig1;
679
            }
680
            else {
681
                increment = ( roundingMode == float_round_up ) && zSig1;
682
            }
683
        }
684
    }
685
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
686
        if (    ( 0x7FFE < zExp )
687
             || (    ( zExp == 0x7FFE )
688
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
689
                  && increment
690
                )
691
           ) {
692
            roundMask = 0;
693
 overflow:
694
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
695
            if (    ( roundingMode == float_round_to_zero )
696
                 || ( zSign && ( roundingMode == float_round_up ) )
697
                 || ( ! zSign && ( roundingMode == float_round_down ) )
698
               ) {
699
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
700
            }
701
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
702
        }
703
        if ( zExp <= 0 ) {
704
            isTiny =
705
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
706
                || ( zExp < 0 )
707
                || ! increment
708
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
709
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
710
            zExp = 0;
711
            if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
712
            if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
713
            if ( roundNearestEven ) {
714
                increment = ( (sbits64) zSig1 < 0 );
715
            }
716
            else {
717
                if ( zSign ) {
718
                    increment = ( roundingMode == float_round_down ) && zSig1;
719
                }
720
                else {
721
                    increment = ( roundingMode == float_round_up ) && zSig1;
722
                }
723
            }
724
            if ( increment ) {
725
                ++zSig0;
726
                zSig0 &=
727
                    ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
728
                if ( (sbits64) zSig0 < 0 ) zExp = 1;
729
            }
730
            return packFloatx80( zSign, zExp, zSig0 );
731
        }
732
    }
733
    if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
734
    if ( increment ) {
735
        ++zSig0;
736
        if ( zSig0 == 0 ) {
737
            ++zExp;
738
            zSig0 = LIT64( 0x8000000000000000 );
739
        }
740
        else {
741
            zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
742
        }
743
    }
744
    else {
745
        if ( zSig0 == 0 ) zExp = 0;
746
    }
747
    return packFloatx80( zSign, zExp, zSig0 );
748

    
749
}
750

    
751
/*----------------------------------------------------------------------------
752
| Takes an abstract floating-point value having sign `zSign', exponent
753
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
754
| and returns the proper extended double-precision floating-point value
755
| corresponding to the abstract input.  This routine is just like
756
| `roundAndPackFloatx80' except that the input significand does not have to be
757
| normalized.
758
*----------------------------------------------------------------------------*/
759

    
760
static floatx80
761
 normalizeRoundAndPackFloatx80(
762
     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
763
 STATUS_PARAM)
764
{
765
    int8 shiftCount;
766

    
767
    if ( zSig0 == 0 ) {
768
        zSig0 = zSig1;
769
        zSig1 = 0;
770
        zExp -= 64;
771
    }
772
    shiftCount = countLeadingZeros64( zSig0 );
773
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
774
    zExp -= shiftCount;
775
    return
776
        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
777

    
778
}
779

    
780
#endif
781

    
782
#ifdef FLOAT128
783

    
784
/*----------------------------------------------------------------------------
785
| Returns the least-significant 64 fraction bits of the quadruple-precision
786
| floating-point value `a'.
787
*----------------------------------------------------------------------------*/
788

    
789
INLINE bits64 extractFloat128Frac1( float128 a )
790
{
791

    
792
    return a.low;
793

    
794
}
795

    
796
/*----------------------------------------------------------------------------
797
| Returns the most-significant 48 fraction bits of the quadruple-precision
798
| floating-point value `a'.
799
*----------------------------------------------------------------------------*/
800

    
801
INLINE bits64 extractFloat128Frac0( float128 a )
802
{
803

    
804
    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
805

    
806
}
807

    
808
/*----------------------------------------------------------------------------
809
| Returns the exponent bits of the quadruple-precision floating-point value
810
| `a'.
811
*----------------------------------------------------------------------------*/
812

    
813
INLINE int32 extractFloat128Exp( float128 a )
814
{
815

    
816
    return ( a.high>>48 ) & 0x7FFF;
817

    
818
}
819

    
820
/*----------------------------------------------------------------------------
821
| Returns the sign bit of the quadruple-precision floating-point value `a'.
822
*----------------------------------------------------------------------------*/
823

    
824
INLINE flag extractFloat128Sign( float128 a )
825
{
826

    
827
    return a.high>>63;
828

    
829
}
830

    
831
/*----------------------------------------------------------------------------
832
| Normalizes the subnormal quadruple-precision floating-point value
833
| represented by the denormalized significand formed by the concatenation of
834
| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
835
| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
836
| significand are stored at the location pointed to by `zSig0Ptr', and the
837
| least significant 64 bits of the normalized significand are stored at the
838
| location pointed to by `zSig1Ptr'.
839
*----------------------------------------------------------------------------*/
840

    
841
static void
842
 normalizeFloat128Subnormal(
843
     bits64 aSig0,
844
     bits64 aSig1,
845
     int32 *zExpPtr,
846
     bits64 *zSig0Ptr,
847
     bits64 *zSig1Ptr
848
 )
849
{
850
    int8 shiftCount;
851

    
852
    if ( aSig0 == 0 ) {
853
        shiftCount = countLeadingZeros64( aSig1 ) - 15;
854
        if ( shiftCount < 0 ) {
855
            *zSig0Ptr = aSig1>>( - shiftCount );
856
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
857
        }
858
        else {
859
            *zSig0Ptr = aSig1<<shiftCount;
860
            *zSig1Ptr = 0;
861
        }
862
        *zExpPtr = - shiftCount - 63;
863
    }
864
    else {
865
        shiftCount = countLeadingZeros64( aSig0 ) - 15;
866
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
867
        *zExpPtr = 1 - shiftCount;
868
    }
869

    
870
}
871

    
872
/*----------------------------------------------------------------------------
873
| Packs the sign `zSign', the exponent `zExp', and the significand formed
874
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
875
| floating-point value, returning the result.  After being shifted into the
876
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
877
| added together to form the most significant 32 bits of the result.  This
878
| means that any integer portion of `zSig0' will be added into the exponent.
879
| Since a properly normalized significand will have an integer portion equal
880
| to 1, the `zExp' input should be 1 less than the desired result exponent
881
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
882
| significand.
883
*----------------------------------------------------------------------------*/
884

    
885
INLINE float128
886
 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
887
{
888
    float128 z;
889

    
890
    z.low = zSig1;
891
    z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
892
    return z;
893

    
894
}
895

    
896
/*----------------------------------------------------------------------------
897
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
898
| and extended significand formed by the concatenation of `zSig0', `zSig1',
899
| and `zSig2', and returns the proper quadruple-precision floating-point value
900
| corresponding to the abstract input.  Ordinarily, the abstract value is
901
| simply rounded and packed into the quadruple-precision format, with the
902
| inexact exception raised if the abstract input cannot be represented
903
| exactly.  However, if the abstract value is too large, the overflow and
904
| inexact exceptions are raised and an infinity or maximal finite value is
905
| returned.  If the abstract value is too small, the input value is rounded to
906
| a subnormal number, and the underflow and inexact exceptions are raised if
907
| the abstract input cannot be represented exactly as a subnormal quadruple-
908
| precision floating-point number.
909
|     The input significand must be normalized or smaller.  If the input
910
| significand is not normalized, `zExp' must be 0; in that case, the result
911
| returned is a subnormal number, and it must not require rounding.  In the
912
| usual case that the input significand is normalized, `zExp' must be 1 less
913
| than the ``true'' floating-point exponent.  The handling of underflow and
914
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
915
*----------------------------------------------------------------------------*/
916

    
917
static float128
918
 roundAndPackFloat128(
919
     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
920
{
921
    int8 roundingMode;
922
    flag roundNearestEven, increment, isTiny;
923

    
924
    roundingMode = STATUS(float_rounding_mode);
925
    roundNearestEven = ( roundingMode == float_round_nearest_even );
926
    increment = ( (sbits64) zSig2 < 0 );
927
    if ( ! roundNearestEven ) {
928
        if ( roundingMode == float_round_to_zero ) {
929
            increment = 0;
930
        }
931
        else {
932
            if ( zSign ) {
933
                increment = ( roundingMode == float_round_down ) && zSig2;
934
            }
935
            else {
936
                increment = ( roundingMode == float_round_up ) && zSig2;
937
            }
938
        }
939
    }
940
    if ( 0x7FFD <= (bits32) zExp ) {
941
        if (    ( 0x7FFD < zExp )
942
             || (    ( zExp == 0x7FFD )
943
                  && eq128(
944
                         LIT64( 0x0001FFFFFFFFFFFF ),
945
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
946
                         zSig0,
947
                         zSig1
948
                     )
949
                  && increment
950
                )
951
           ) {
952
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
953
            if (    ( roundingMode == float_round_to_zero )
954
                 || ( zSign && ( roundingMode == float_round_up ) )
955
                 || ( ! zSign && ( roundingMode == float_round_down ) )
956
               ) {
957
                return
958
                    packFloat128(
959
                        zSign,
960
                        0x7FFE,
961
                        LIT64( 0x0000FFFFFFFFFFFF ),
962
                        LIT64( 0xFFFFFFFFFFFFFFFF )
963
                    );
964
            }
965
            return packFloat128( zSign, 0x7FFF, 0, 0 );
966
        }
967
        if ( zExp < 0 ) {
968
            isTiny =
969
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
970
                || ( zExp < -1 )
971
                || ! increment
972
                || lt128(
973
                       zSig0,
974
                       zSig1,
975
                       LIT64( 0x0001FFFFFFFFFFFF ),
976
                       LIT64( 0xFFFFFFFFFFFFFFFF )
977
                   );
978
            shift128ExtraRightJamming(
979
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
980
            zExp = 0;
981
            if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
982
            if ( roundNearestEven ) {
983
                increment = ( (sbits64) zSig2 < 0 );
984
            }
985
            else {
986
                if ( zSign ) {
987
                    increment = ( roundingMode == float_round_down ) && zSig2;
988
                }
989
                else {
990
                    increment = ( roundingMode == float_round_up ) && zSig2;
991
                }
992
            }
993
        }
994
    }
995
    if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
996
    if ( increment ) {
997
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
998
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
999
    }
1000
    else {
1001
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1002
    }
1003
    return packFloat128( zSign, zExp, zSig0, zSig1 );
1004

    
1005
}
1006

    
1007
/*----------------------------------------------------------------------------
1008
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1009
| and significand formed by the concatenation of `zSig0' and `zSig1', and
1010
| returns the proper quadruple-precision floating-point value corresponding
1011
| to the abstract input.  This routine is just like `roundAndPackFloat128'
1012
| except that the input significand has fewer bits and does not have to be
1013
| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1014
| point exponent.
1015
*----------------------------------------------------------------------------*/
1016

    
1017
static float128
1018
 normalizeRoundAndPackFloat128(
1019
     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1020
{
1021
    int8 shiftCount;
1022
    bits64 zSig2;
1023

    
1024
    if ( zSig0 == 0 ) {
1025
        zSig0 = zSig1;
1026
        zSig1 = 0;
1027
        zExp -= 64;
1028
    }
1029
    shiftCount = countLeadingZeros64( zSig0 ) - 15;
1030
    if ( 0 <= shiftCount ) {
1031
        zSig2 = 0;
1032
        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1033
    }
1034
    else {
1035
        shift128ExtraRightJamming(
1036
            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1037
    }
1038
    zExp -= shiftCount;
1039
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1040

    
1041
}
1042

    
1043
#endif
1044

    
1045
/*----------------------------------------------------------------------------
1046
| Returns the result of converting the 32-bit two's complement integer `a'
1047
| to the single-precision floating-point format.  The conversion is performed
1048
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1049
*----------------------------------------------------------------------------*/
1050

    
1051
float32 int32_to_float32( int32 a STATUS_PARAM )
1052
{
1053
    flag zSign;
1054

    
1055
    if ( a == 0 ) return float32_zero;
1056
    if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1057
    zSign = ( a < 0 );
1058
    return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1059

    
1060
}
1061

    
1062
/*----------------------------------------------------------------------------
1063
| Returns the result of converting the 32-bit two's complement integer `a'
1064
| to the double-precision floating-point format.  The conversion is performed
1065
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1066
*----------------------------------------------------------------------------*/
1067

    
1068
float64 int32_to_float64( int32 a STATUS_PARAM )
1069
{
1070
    flag zSign;
1071
    uint32 absA;
1072
    int8 shiftCount;
1073
    bits64 zSig;
1074

    
1075
    if ( a == 0 ) return float64_zero;
1076
    zSign = ( a < 0 );
1077
    absA = zSign ? - a : a;
1078
    shiftCount = countLeadingZeros32( absA ) + 21;
1079
    zSig = absA;
1080
    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1081

    
1082
}
1083

    
1084
#ifdef FLOATX80
1085

    
1086
/*----------------------------------------------------------------------------
1087
| Returns the result of converting the 32-bit two's complement integer `a'
1088
| to the extended double-precision floating-point format.  The conversion
1089
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1090
| Arithmetic.
1091
*----------------------------------------------------------------------------*/
1092

    
1093
floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1094
{
1095
    flag zSign;
1096
    uint32 absA;
1097
    int8 shiftCount;
1098
    bits64 zSig;
1099

    
1100
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1101
    zSign = ( a < 0 );
1102
    absA = zSign ? - a : a;
1103
    shiftCount = countLeadingZeros32( absA ) + 32;
1104
    zSig = absA;
1105
    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1106

    
1107
}
1108

    
1109
#endif
1110

    
1111
#ifdef FLOAT128
1112

    
1113
/*----------------------------------------------------------------------------
1114
| Returns the result of converting the 32-bit two's complement integer `a' to
1115
| the quadruple-precision floating-point format.  The conversion is performed
1116
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1117
*----------------------------------------------------------------------------*/
1118

    
1119
float128 int32_to_float128( int32 a STATUS_PARAM )
1120
{
1121
    flag zSign;
1122
    uint32 absA;
1123
    int8 shiftCount;
1124
    bits64 zSig0;
1125

    
1126
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1127
    zSign = ( a < 0 );
1128
    absA = zSign ? - a : a;
1129
    shiftCount = countLeadingZeros32( absA ) + 17;
1130
    zSig0 = absA;
1131
    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1132

    
1133
}
1134

    
1135
#endif
1136

    
1137
/*----------------------------------------------------------------------------
1138
| Returns the result of converting the 64-bit two's complement integer `a'
1139
| to the single-precision floating-point format.  The conversion is performed
1140
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1141
*----------------------------------------------------------------------------*/
1142

    
1143
float32 int64_to_float32( int64 a STATUS_PARAM )
1144
{
1145
    flag zSign;
1146
    uint64 absA;
1147
    int8 shiftCount;
1148

    
1149
    if ( a == 0 ) return float32_zero;
1150
    zSign = ( a < 0 );
1151
    absA = zSign ? - a : a;
1152
    shiftCount = countLeadingZeros64( absA ) - 40;
1153
    if ( 0 <= shiftCount ) {
1154
        return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1155
    }
1156
    else {
1157
        shiftCount += 7;
1158
        if ( shiftCount < 0 ) {
1159
            shift64RightJamming( absA, - shiftCount, &absA );
1160
        }
1161
        else {
1162
            absA <<= shiftCount;
1163
        }
1164
        return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1165
    }
1166

    
1167
}
1168

    
1169
float32 uint64_to_float32( uint64 a STATUS_PARAM )
1170
{
1171
    int8 shiftCount;
1172

    
1173
    if ( a == 0 ) return float32_zero;
1174
    shiftCount = countLeadingZeros64( a ) - 40;
1175
    if ( 0 <= shiftCount ) {
1176
        return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1177
    }
1178
    else {
1179
        shiftCount += 7;
1180
        if ( shiftCount < 0 ) {
1181
            shift64RightJamming( a, - shiftCount, &a );
1182
        }
1183
        else {
1184
            a <<= shiftCount;
1185
        }
1186
        return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1187
    }
1188
}
1189

    
1190
/*----------------------------------------------------------------------------
1191
| Returns the result of converting the 64-bit two's complement integer `a'
1192
| to the double-precision floating-point format.  The conversion is performed
1193
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1194
*----------------------------------------------------------------------------*/
1195

    
1196
float64 int64_to_float64( int64 a STATUS_PARAM )
1197
{
1198
    flag zSign;
1199

    
1200
    if ( a == 0 ) return float64_zero;
1201
    if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1202
        return packFloat64( 1, 0x43E, 0 );
1203
    }
1204
    zSign = ( a < 0 );
1205
    return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1206

    
1207
}
1208

    
1209
float64 uint64_to_float64( uint64 a STATUS_PARAM )
1210
{
1211
    if ( a == 0 ) return float64_zero;
1212
    return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1213

    
1214
}
1215

    
1216
#ifdef FLOATX80
1217

    
1218
/*----------------------------------------------------------------------------
1219
| Returns the result of converting the 64-bit two's complement integer `a'
1220
| to the extended double-precision floating-point format.  The conversion
1221
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1222
| Arithmetic.
1223
*----------------------------------------------------------------------------*/
1224

    
1225
floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1226
{
1227
    flag zSign;
1228
    uint64 absA;
1229
    int8 shiftCount;
1230

    
1231
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1232
    zSign = ( a < 0 );
1233
    absA = zSign ? - a : a;
1234
    shiftCount = countLeadingZeros64( absA );
1235
    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1236

    
1237
}
1238

    
1239
#endif
1240

    
1241
#ifdef FLOAT128
1242

    
1243
/*----------------------------------------------------------------------------
1244
| Returns the result of converting the 64-bit two's complement integer `a' to
1245
| the quadruple-precision floating-point format.  The conversion is performed
1246
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1247
*----------------------------------------------------------------------------*/
1248

    
1249
float128 int64_to_float128( int64 a STATUS_PARAM )
1250
{
1251
    flag zSign;
1252
    uint64 absA;
1253
    int8 shiftCount;
1254
    int32 zExp;
1255
    bits64 zSig0, zSig1;
1256

    
1257
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1258
    zSign = ( a < 0 );
1259
    absA = zSign ? - a : a;
1260
    shiftCount = countLeadingZeros64( absA ) + 49;
1261
    zExp = 0x406E - shiftCount;
1262
    if ( 64 <= shiftCount ) {
1263
        zSig1 = 0;
1264
        zSig0 = absA;
1265
        shiftCount -= 64;
1266
    }
1267
    else {
1268
        zSig1 = absA;
1269
        zSig0 = 0;
1270
    }
1271
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1272
    return packFloat128( zSign, zExp, zSig0, zSig1 );
1273

    
1274
}
1275

    
1276
#endif
1277

    
1278
/*----------------------------------------------------------------------------
1279
| Returns the result of converting the single-precision floating-point value
1280
| `a' to the 32-bit two's complement integer format.  The conversion is
1281
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1282
| Arithmetic---which means in particular that the conversion is rounded
1283
| according to the current rounding mode.  If `a' is a NaN, the largest
1284
| positive integer is returned.  Otherwise, if the conversion overflows, the
1285
| largest integer with the same sign as `a' is returned.
1286
*----------------------------------------------------------------------------*/
1287

    
1288
int32 float32_to_int32( float32 a STATUS_PARAM )
1289
{
1290
    flag aSign;
1291
    int16 aExp, shiftCount;
1292
    bits32 aSig;
1293
    bits64 aSig64;
1294

    
1295
    aSig = extractFloat32Frac( a );
1296
    aExp = extractFloat32Exp( a );
1297
    aSign = extractFloat32Sign( a );
1298
    if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1299
    if ( aExp ) aSig |= 0x00800000;
1300
    shiftCount = 0xAF - aExp;
1301
    aSig64 = aSig;
1302
    aSig64 <<= 32;
1303
    if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1304
    return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1305

    
1306
}
1307

    
1308
/*----------------------------------------------------------------------------
1309
| Returns the result of converting the single-precision floating-point value
1310
| `a' to the 32-bit two's complement integer format.  The conversion is
1311
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1312
| Arithmetic, except that the conversion is always rounded toward zero.
1313
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1314
| the conversion overflows, the largest integer with the same sign as `a' is
1315
| returned.
1316
*----------------------------------------------------------------------------*/
1317

    
1318
int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1319
{
1320
    flag aSign;
1321
    int16 aExp, shiftCount;
1322
    bits32 aSig;
1323
    int32 z;
1324

    
1325
    aSig = extractFloat32Frac( a );
1326
    aExp = extractFloat32Exp( a );
1327
    aSign = extractFloat32Sign( a );
1328
    shiftCount = aExp - 0x9E;
1329
    if ( 0 <= shiftCount ) {
1330
        if ( float32_val(a) != 0xCF000000 ) {
1331
            float_raise( float_flag_invalid STATUS_VAR);
1332
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1333
        }
1334
        return (sbits32) 0x80000000;
1335
    }
1336
    else if ( aExp <= 0x7E ) {
1337
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1338
        return 0;
1339
    }
1340
    aSig = ( aSig | 0x00800000 )<<8;
1341
    z = aSig>>( - shiftCount );
1342
    if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1343
        STATUS(float_exception_flags) |= float_flag_inexact;
1344
    }
1345
    if ( aSign ) z = - z;
1346
    return z;
1347

    
1348
}
1349

    
1350
/*----------------------------------------------------------------------------
1351
| Returns the result of converting the single-precision floating-point value
1352
| `a' to the 64-bit two's complement integer format.  The conversion is
1353
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1354
| Arithmetic---which means in particular that the conversion is rounded
1355
| according to the current rounding mode.  If `a' is a NaN, the largest
1356
| positive integer is returned.  Otherwise, if the conversion overflows, the
1357
| largest integer with the same sign as `a' is returned.
1358
*----------------------------------------------------------------------------*/
1359

    
1360
int64 float32_to_int64( float32 a STATUS_PARAM )
1361
{
1362
    flag aSign;
1363
    int16 aExp, shiftCount;
1364
    bits32 aSig;
1365
    bits64 aSig64, aSigExtra;
1366

    
1367
    aSig = extractFloat32Frac( a );
1368
    aExp = extractFloat32Exp( a );
1369
    aSign = extractFloat32Sign( a );
1370
    shiftCount = 0xBE - aExp;
1371
    if ( shiftCount < 0 ) {
1372
        float_raise( float_flag_invalid STATUS_VAR);
1373
        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1374
            return LIT64( 0x7FFFFFFFFFFFFFFF );
1375
        }
1376
        return (sbits64) LIT64( 0x8000000000000000 );
1377
    }
1378
    if ( aExp ) aSig |= 0x00800000;
1379
    aSig64 = aSig;
1380
    aSig64 <<= 40;
1381
    shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1382
    return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1383

    
1384
}
1385

    
1386
/*----------------------------------------------------------------------------
1387
| Returns the result of converting the single-precision floating-point value
1388
| `a' to the 64-bit two's complement integer format.  The conversion is
1389
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1390
| Arithmetic, except that the conversion is always rounded toward zero.  If
1391
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1392
| conversion overflows, the largest integer with the same sign as `a' is
1393
| returned.
1394
*----------------------------------------------------------------------------*/
1395

    
1396
int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1397
{
1398
    flag aSign;
1399
    int16 aExp, shiftCount;
1400
    bits32 aSig;
1401
    bits64 aSig64;
1402
    int64 z;
1403

    
1404
    aSig = extractFloat32Frac( a );
1405
    aExp = extractFloat32Exp( a );
1406
    aSign = extractFloat32Sign( a );
1407
    shiftCount = aExp - 0xBE;
1408
    if ( 0 <= shiftCount ) {
1409
        if ( float32_val(a) != 0xDF000000 ) {
1410
            float_raise( float_flag_invalid STATUS_VAR);
1411
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1412
                return LIT64( 0x7FFFFFFFFFFFFFFF );
1413
            }
1414
        }
1415
        return (sbits64) LIT64( 0x8000000000000000 );
1416
    }
1417
    else if ( aExp <= 0x7E ) {
1418
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1419
        return 0;
1420
    }
1421
    aSig64 = aSig | 0x00800000;
1422
    aSig64 <<= 40;
1423
    z = aSig64>>( - shiftCount );
1424
    if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1425
        STATUS(float_exception_flags) |= float_flag_inexact;
1426
    }
1427
    if ( aSign ) z = - z;
1428
    return z;
1429

    
1430
}
1431

    
1432
/*----------------------------------------------------------------------------
1433
| Returns the result of converting the single-precision floating-point value
1434
| `a' to the double-precision floating-point format.  The conversion is
1435
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1436
| Arithmetic.
1437
*----------------------------------------------------------------------------*/
1438

    
1439
float64 float32_to_float64( float32 a STATUS_PARAM )
1440
{
1441
    flag aSign;
1442
    int16 aExp;
1443
    bits32 aSig;
1444

    
1445
    aSig = extractFloat32Frac( a );
1446
    aExp = extractFloat32Exp( a );
1447
    aSign = extractFloat32Sign( a );
1448
    if ( aExp == 0xFF ) {
1449
        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1450
        return packFloat64( aSign, 0x7FF, 0 );
1451
    }
1452
    if ( aExp == 0 ) {
1453
        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1454
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1455
        --aExp;
1456
    }
1457
    return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1458

    
1459
}
1460

    
1461
#ifdef FLOATX80
1462

    
1463
/*----------------------------------------------------------------------------
1464
| Returns the result of converting the single-precision floating-point value
1465
| `a' to the extended double-precision floating-point format.  The conversion
1466
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1467
| Arithmetic.
1468
*----------------------------------------------------------------------------*/
1469

    
1470
floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1471
{
1472
    flag aSign;
1473
    int16 aExp;
1474
    bits32 aSig;
1475

    
1476
    aSig = extractFloat32Frac( a );
1477
    aExp = extractFloat32Exp( a );
1478
    aSign = extractFloat32Sign( a );
1479
    if ( aExp == 0xFF ) {
1480
        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1481
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1482
    }
1483
    if ( aExp == 0 ) {
1484
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1485
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1486
    }
1487
    aSig |= 0x00800000;
1488
    return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1489

    
1490
}
1491

    
1492
#endif
1493

    
1494
#ifdef FLOAT128
1495

    
1496
/*----------------------------------------------------------------------------
1497
| Returns the result of converting the single-precision floating-point value
1498
| `a' to the double-precision floating-point format.  The conversion is
1499
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1500
| Arithmetic.
1501
*----------------------------------------------------------------------------*/
1502

    
1503
float128 float32_to_float128( float32 a STATUS_PARAM )
1504
{
1505
    flag aSign;
1506
    int16 aExp;
1507
    bits32 aSig;
1508

    
1509
    aSig = extractFloat32Frac( a );
1510
    aExp = extractFloat32Exp( a );
1511
    aSign = extractFloat32Sign( a );
1512
    if ( aExp == 0xFF ) {
1513
        if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1514
        return packFloat128( aSign, 0x7FFF, 0, 0 );
1515
    }
1516
    if ( aExp == 0 ) {
1517
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1518
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1519
        --aExp;
1520
    }
1521
    return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1522

    
1523
}
1524

    
1525
#endif
1526

    
1527
/*----------------------------------------------------------------------------
1528
| Rounds the single-precision floating-point value `a' to an integer, and
1529
| returns the result as a single-precision floating-point value.  The
1530
| operation is performed according to the IEC/IEEE Standard for Binary
1531
| Floating-Point Arithmetic.
1532
*----------------------------------------------------------------------------*/
1533

    
1534
float32 float32_round_to_int( float32 a STATUS_PARAM)
1535
{
1536
    flag aSign;
1537
    int16 aExp;
1538
    bits32 lastBitMask, roundBitsMask;
1539
    int8 roundingMode;
1540
    bits32 z;
1541

    
1542
    aExp = extractFloat32Exp( a );
1543
    if ( 0x96 <= aExp ) {
1544
        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1545
            return propagateFloat32NaN( a, a STATUS_VAR );
1546
        }
1547
        return a;
1548
    }
1549
    if ( aExp <= 0x7E ) {
1550
        if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
1551
        STATUS(float_exception_flags) |= float_flag_inexact;
1552
        aSign = extractFloat32Sign( a );
1553
        switch ( STATUS(float_rounding_mode) ) {
1554
         case float_round_nearest_even:
1555
            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1556
                return packFloat32( aSign, 0x7F, 0 );
1557
            }
1558
            break;
1559
         case float_round_down:
1560
            return make_float32(aSign ? 0xBF800000 : 0);
1561
         case float_round_up:
1562
            return make_float32(aSign ? 0x80000000 : 0x3F800000);
1563
        }
1564
        return packFloat32( aSign, 0, 0 );
1565
    }
1566
    lastBitMask = 1;
1567
    lastBitMask <<= 0x96 - aExp;
1568
    roundBitsMask = lastBitMask - 1;
1569
    z = float32_val(a);
1570
    roundingMode = STATUS(float_rounding_mode);
1571
    if ( roundingMode == float_round_nearest_even ) {
1572
        z += lastBitMask>>1;
1573
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1574
    }
1575
    else if ( roundingMode != float_round_to_zero ) {
1576
        if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1577
            z += roundBitsMask;
1578
        }
1579
    }
1580
    z &= ~ roundBitsMask;
1581
    if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1582
    return make_float32(z);
1583

    
1584
}
1585

    
1586
/*----------------------------------------------------------------------------
1587
| Returns the result of adding the absolute values of the single-precision
1588
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1589
| before being returned.  `zSign' is ignored if the result is a NaN.
1590
| The addition is performed according to the IEC/IEEE Standard for Binary
1591
| Floating-Point Arithmetic.
1592
*----------------------------------------------------------------------------*/
1593

    
1594
static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1595
{
1596
    int16 aExp, bExp, zExp;
1597
    bits32 aSig, bSig, zSig;
1598
    int16 expDiff;
1599

    
1600
    aSig = extractFloat32Frac( a );
1601
    aExp = extractFloat32Exp( a );
1602
    bSig = extractFloat32Frac( b );
1603
    bExp = extractFloat32Exp( b );
1604
    expDiff = aExp - bExp;
1605
    aSig <<= 6;
1606
    bSig <<= 6;
1607
    if ( 0 < expDiff ) {
1608
        if ( aExp == 0xFF ) {
1609
            if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1610
            return a;
1611
        }
1612
        if ( bExp == 0 ) {
1613
            --expDiff;
1614
        }
1615
        else {
1616
            bSig |= 0x20000000;
1617
        }
1618
        shift32RightJamming( bSig, expDiff, &bSig );
1619
        zExp = aExp;
1620
    }
1621
    else if ( expDiff < 0 ) {
1622
        if ( bExp == 0xFF ) {
1623
            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1624
            return packFloat32( zSign, 0xFF, 0 );
1625
        }
1626
        if ( aExp == 0 ) {
1627
            ++expDiff;
1628
        }
1629
        else {
1630
            aSig |= 0x20000000;
1631
        }
1632
        shift32RightJamming( aSig, - expDiff, &aSig );
1633
        zExp = bExp;
1634
    }
1635
    else {
1636
        if ( aExp == 0xFF ) {
1637
            if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1638
            return a;
1639
        }
1640
        if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1641
        zSig = 0x40000000 + aSig + bSig;
1642
        zExp = aExp;
1643
        goto roundAndPack;
1644
    }
1645
    aSig |= 0x20000000;
1646
    zSig = ( aSig + bSig )<<1;
1647
    --zExp;
1648
    if ( (sbits32) zSig < 0 ) {
1649
        zSig = aSig + bSig;
1650
        ++zExp;
1651
    }
1652
 roundAndPack:
1653
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1654

    
1655
}
1656

    
1657
/*----------------------------------------------------------------------------
1658
| Returns the result of subtracting the absolute values of the single-
1659
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
1660
| difference is negated before being returned.  `zSign' is ignored if the
1661
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
1662
| Standard for Binary Floating-Point Arithmetic.
1663
*----------------------------------------------------------------------------*/
1664

    
1665
static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1666
{
1667
    int16 aExp, bExp, zExp;
1668
    bits32 aSig, bSig, zSig;
1669
    int16 expDiff;
1670

    
1671
    aSig = extractFloat32Frac( a );
1672
    aExp = extractFloat32Exp( a );
1673
    bSig = extractFloat32Frac( b );
1674
    bExp = extractFloat32Exp( b );
1675
    expDiff = aExp - bExp;
1676
    aSig <<= 7;
1677
    bSig <<= 7;
1678
    if ( 0 < expDiff ) goto aExpBigger;
1679
    if ( expDiff < 0 ) goto bExpBigger;
1680
    if ( aExp == 0xFF ) {
1681
        if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1682
        float_raise( float_flag_invalid STATUS_VAR);
1683
        return float32_default_nan;
1684
    }
1685
    if ( aExp == 0 ) {
1686
        aExp = 1;
1687
        bExp = 1;
1688
    }
1689
    if ( bSig < aSig ) goto aBigger;
1690
    if ( aSig < bSig ) goto bBigger;
1691
    return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1692
 bExpBigger:
1693
    if ( bExp == 0xFF ) {
1694
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1695
        return packFloat32( zSign ^ 1, 0xFF, 0 );
1696
    }
1697
    if ( aExp == 0 ) {
1698
        ++expDiff;
1699
    }
1700
    else {
1701
        aSig |= 0x40000000;
1702
    }
1703
    shift32RightJamming( aSig, - expDiff, &aSig );
1704
    bSig |= 0x40000000;
1705
 bBigger:
1706
    zSig = bSig - aSig;
1707
    zExp = bExp;
1708
    zSign ^= 1;
1709
    goto normalizeRoundAndPack;
1710
 aExpBigger:
1711
    if ( aExp == 0xFF ) {
1712
        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1713
        return a;
1714
    }
1715
    if ( bExp == 0 ) {
1716
        --expDiff;
1717
    }
1718
    else {
1719
        bSig |= 0x40000000;
1720
    }
1721
    shift32RightJamming( bSig, expDiff, &bSig );
1722
    aSig |= 0x40000000;
1723
 aBigger:
1724
    zSig = aSig - bSig;
1725
    zExp = aExp;
1726
 normalizeRoundAndPack:
1727
    --zExp;
1728
    return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1729

    
1730
}
1731

    
1732
/*----------------------------------------------------------------------------
1733
| Returns the result of adding the single-precision floating-point values `a'
1734
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
1735
| Binary Floating-Point Arithmetic.
1736
*----------------------------------------------------------------------------*/
1737

    
1738
float32 float32_add( float32 a, float32 b STATUS_PARAM )
1739
{
1740
    flag aSign, bSign;
1741

    
1742
    aSign = extractFloat32Sign( a );
1743
    bSign = extractFloat32Sign( b );
1744
    if ( aSign == bSign ) {
1745
        return addFloat32Sigs( a, b, aSign STATUS_VAR);
1746
    }
1747
    else {
1748
        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1749
    }
1750

    
1751
}
1752

    
1753
/*----------------------------------------------------------------------------
1754
| Returns the result of subtracting the single-precision floating-point values
1755
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1756
| for Binary Floating-Point Arithmetic.
1757
*----------------------------------------------------------------------------*/
1758

    
1759
float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1760
{
1761
    flag aSign, bSign;
1762

    
1763
    aSign = extractFloat32Sign( a );
1764
    bSign = extractFloat32Sign( b );
1765
    if ( aSign == bSign ) {
1766
        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1767
    }
1768
    else {
1769
        return addFloat32Sigs( a, b, aSign STATUS_VAR );
1770
    }
1771

    
1772
}
1773

    
1774
/*----------------------------------------------------------------------------
1775
| Returns the result of multiplying the single-precision floating-point values
1776
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1777
| for Binary Floating-Point Arithmetic.
1778
*----------------------------------------------------------------------------*/
1779

    
1780
float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1781
{
1782
    flag aSign, bSign, zSign;
1783
    int16 aExp, bExp, zExp;
1784
    bits32 aSig, bSig;
1785
    bits64 zSig64;
1786
    bits32 zSig;
1787

    
1788
    aSig = extractFloat32Frac( a );
1789
    aExp = extractFloat32Exp( a );
1790
    aSign = extractFloat32Sign( a );
1791
    bSig = extractFloat32Frac( b );
1792
    bExp = extractFloat32Exp( b );
1793
    bSign = extractFloat32Sign( b );
1794
    zSign = aSign ^ bSign;
1795
    if ( aExp == 0xFF ) {
1796
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1797
            return propagateFloat32NaN( a, b STATUS_VAR );
1798
        }
1799
        if ( ( bExp | bSig ) == 0 ) {
1800
            float_raise( float_flag_invalid STATUS_VAR);
1801
            return float32_default_nan;
1802
        }
1803
        return packFloat32( zSign, 0xFF, 0 );
1804
    }
1805
    if ( bExp == 0xFF ) {
1806
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1807
        if ( ( aExp | aSig ) == 0 ) {
1808
            float_raise( float_flag_invalid STATUS_VAR);
1809
            return float32_default_nan;
1810
        }
1811
        return packFloat32( zSign, 0xFF, 0 );
1812
    }
1813
    if ( aExp == 0 ) {
1814
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1815
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1816
    }
1817
    if ( bExp == 0 ) {
1818
        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1819
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1820
    }
1821
    zExp = aExp + bExp - 0x7F;
1822
    aSig = ( aSig | 0x00800000 )<<7;
1823
    bSig = ( bSig | 0x00800000 )<<8;
1824
    shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1825
    zSig = zSig64;
1826
    if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1827
        zSig <<= 1;
1828
        --zExp;
1829
    }
1830
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1831

    
1832
}
1833

    
1834
/*----------------------------------------------------------------------------
1835
| Returns the result of dividing the single-precision floating-point value `a'
1836
| by the corresponding value `b'.  The operation is performed according to the
1837
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1838
*----------------------------------------------------------------------------*/
1839

    
1840
float32 float32_div( float32 a, float32 b STATUS_PARAM )
1841
{
1842
    flag aSign, bSign, zSign;
1843
    int16 aExp, bExp, zExp;
1844
    bits32 aSig, bSig, zSig;
1845

    
1846
    aSig = extractFloat32Frac( a );
1847
    aExp = extractFloat32Exp( a );
1848
    aSign = extractFloat32Sign( a );
1849
    bSig = extractFloat32Frac( b );
1850
    bExp = extractFloat32Exp( b );
1851
    bSign = extractFloat32Sign( b );
1852
    zSign = aSign ^ bSign;
1853
    if ( aExp == 0xFF ) {
1854
        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1855
        if ( bExp == 0xFF ) {
1856
            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1857
            float_raise( float_flag_invalid STATUS_VAR);
1858
            return float32_default_nan;
1859
        }
1860
        return packFloat32( zSign, 0xFF, 0 );
1861
    }
1862
    if ( bExp == 0xFF ) {
1863
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1864
        return packFloat32( zSign, 0, 0 );
1865
    }
1866
    if ( bExp == 0 ) {
1867
        if ( bSig == 0 ) {
1868
            if ( ( aExp | aSig ) == 0 ) {
1869
                float_raise( float_flag_invalid STATUS_VAR);
1870
                return float32_default_nan;
1871
            }
1872
            float_raise( float_flag_divbyzero STATUS_VAR);
1873
            return packFloat32( zSign, 0xFF, 0 );
1874
        }
1875
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1876
    }
1877
    if ( aExp == 0 ) {
1878
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1879
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1880
    }
1881
    zExp = aExp - bExp + 0x7D;
1882
    aSig = ( aSig | 0x00800000 )<<7;
1883
    bSig = ( bSig | 0x00800000 )<<8;
1884
    if ( bSig <= ( aSig + aSig ) ) {
1885
        aSig >>= 1;
1886
        ++zExp;
1887
    }
1888
    zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1889
    if ( ( zSig & 0x3F ) == 0 ) {
1890
        zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1891
    }
1892
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1893

    
1894
}
1895

    
1896
/*----------------------------------------------------------------------------
1897
| Returns the remainder of the single-precision floating-point value `a'
1898
| with respect to the corresponding value `b'.  The operation is performed
1899
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1900
*----------------------------------------------------------------------------*/
1901

    
1902
float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1903
{
1904
    flag aSign, bSign, zSign;
1905
    int16 aExp, bExp, expDiff;
1906
    bits32 aSig, bSig;
1907
    bits32 q;
1908
    bits64 aSig64, bSig64, q64;
1909
    bits32 alternateASig;
1910
    sbits32 sigMean;
1911

    
1912
    aSig = extractFloat32Frac( a );
1913
    aExp = extractFloat32Exp( a );
1914
    aSign = extractFloat32Sign( a );
1915
    bSig = extractFloat32Frac( b );
1916
    bExp = extractFloat32Exp( b );
1917
    bSign = extractFloat32Sign( b );
1918
    if ( aExp == 0xFF ) {
1919
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1920
            return propagateFloat32NaN( a, b STATUS_VAR );
1921
        }
1922
        float_raise( float_flag_invalid STATUS_VAR);
1923
        return float32_default_nan;
1924
    }
1925
    if ( bExp == 0xFF ) {
1926
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1927
        return a;
1928
    }
1929
    if ( bExp == 0 ) {
1930
        if ( bSig == 0 ) {
1931
            float_raise( float_flag_invalid STATUS_VAR);
1932
            return float32_default_nan;
1933
        }
1934
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1935
    }
1936
    if ( aExp == 0 ) {
1937
        if ( aSig == 0 ) return a;
1938
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1939
    }
1940
    expDiff = aExp - bExp;
1941
    aSig |= 0x00800000;
1942
    bSig |= 0x00800000;
1943
    if ( expDiff < 32 ) {
1944
        aSig <<= 8;
1945
        bSig <<= 8;
1946
        if ( expDiff < 0 ) {
1947
            if ( expDiff < -1 ) return a;
1948
            aSig >>= 1;
1949
        }
1950
        q = ( bSig <= aSig );
1951
        if ( q ) aSig -= bSig;
1952
        if ( 0 < expDiff ) {
1953
            q = ( ( (bits64) aSig )<<32 ) / bSig;
1954
            q >>= 32 - expDiff;
1955
            bSig >>= 2;
1956
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1957
        }
1958
        else {
1959
            aSig >>= 2;
1960
            bSig >>= 2;
1961
        }
1962
    }
1963
    else {
1964
        if ( bSig <= aSig ) aSig -= bSig;
1965
        aSig64 = ( (bits64) aSig )<<40;
1966
        bSig64 = ( (bits64) bSig )<<40;
1967
        expDiff -= 64;
1968
        while ( 0 < expDiff ) {
1969
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1970
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1971
            aSig64 = - ( ( bSig * q64 )<<38 );
1972
            expDiff -= 62;
1973
        }
1974
        expDiff += 64;
1975
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1976
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1977
        q = q64>>( 64 - expDiff );
1978
        bSig <<= 6;
1979
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1980
    }
1981
    do {
1982
        alternateASig = aSig;
1983
        ++q;
1984
        aSig -= bSig;
1985
    } while ( 0 <= (sbits32) aSig );
1986
    sigMean = aSig + alternateASig;
1987
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1988
        aSig = alternateASig;
1989
    }
1990
    zSign = ( (sbits32) aSig < 0 );
1991
    if ( zSign ) aSig = - aSig;
1992
    return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
1993

    
1994
}
1995

    
1996
/*----------------------------------------------------------------------------
1997
| Returns the square root of the single-precision floating-point value `a'.
1998
| The operation is performed according to the IEC/IEEE Standard for Binary
1999
| Floating-Point Arithmetic.
2000
*----------------------------------------------------------------------------*/
2001

    
2002
float32 float32_sqrt( float32 a STATUS_PARAM )
2003
{
2004
    flag aSign;
2005
    int16 aExp, zExp;
2006
    bits32 aSig, zSig;
2007
    bits64 rem, term;
2008

    
2009
    aSig = extractFloat32Frac( a );
2010
    aExp = extractFloat32Exp( a );
2011
    aSign = extractFloat32Sign( a );
2012
    if ( aExp == 0xFF ) {
2013
        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2014
        if ( ! aSign ) return a;
2015
        float_raise( float_flag_invalid STATUS_VAR);
2016
        return float32_default_nan;
2017
    }
2018
    if ( aSign ) {
2019
        if ( ( aExp | aSig ) == 0 ) return a;
2020
        float_raise( float_flag_invalid STATUS_VAR);
2021
        return float32_default_nan;
2022
    }
2023
    if ( aExp == 0 ) {
2024
        if ( aSig == 0 ) return float32_zero;
2025
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2026
    }
2027
    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2028
    aSig = ( aSig | 0x00800000 )<<8;
2029
    zSig = estimateSqrt32( aExp, aSig ) + 2;
2030
    if ( ( zSig & 0x7F ) <= 5 ) {
2031
        if ( zSig < 2 ) {
2032
            zSig = 0x7FFFFFFF;
2033
            goto roundAndPack;
2034
        }
2035
        aSig >>= aExp & 1;
2036
        term = ( (bits64) zSig ) * zSig;
2037
        rem = ( ( (bits64) aSig )<<32 ) - term;
2038
        while ( (sbits64) rem < 0 ) {
2039
            --zSig;
2040
            rem += ( ( (bits64) zSig )<<1 ) | 1;
2041
        }
2042
        zSig |= ( rem != 0 );
2043
    }
2044
    shift32RightJamming( zSig, 1, &zSig );
2045
 roundAndPack:
2046
    return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2047

    
2048
}
2049

    
2050
/*----------------------------------------------------------------------------
2051
| Returns 1 if the single-precision floating-point value `a' is equal to
2052
| the corresponding value `b', and 0 otherwise.  The comparison is performed
2053
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2054
*----------------------------------------------------------------------------*/
2055

    
2056
int float32_eq( float32 a, float32 b STATUS_PARAM )
2057
{
2058

    
2059
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2060
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2061
       ) {
2062
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2063
            float_raise( float_flag_invalid STATUS_VAR);
2064
        }
2065
        return 0;
2066
    }
2067
    return ( float32_val(a) == float32_val(b) ) ||
2068
            ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2069

    
2070
}
2071

    
2072
/*----------------------------------------------------------------------------
2073
| Returns 1 if the single-precision floating-point value `a' is less than
2074
| or equal to the corresponding value `b', and 0 otherwise.  The comparison
2075
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2076
| Arithmetic.
2077
*----------------------------------------------------------------------------*/
2078

    
2079
int float32_le( float32 a, float32 b STATUS_PARAM )
2080
{
2081
    flag aSign, bSign;
2082
    bits32 av, bv;
2083

    
2084
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2085
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2086
       ) {
2087
        float_raise( float_flag_invalid STATUS_VAR);
2088
        return 0;
2089
    }
2090
    aSign = extractFloat32Sign( a );
2091
    bSign = extractFloat32Sign( b );
2092
    av = float32_val(a);
2093
    bv = float32_val(b);
2094
    if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2095
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2096

    
2097
}
2098

    
2099
/*----------------------------------------------------------------------------
2100
| Returns 1 if the single-precision floating-point value `a' is less than
2101
| the corresponding value `b', and 0 otherwise.  The comparison is performed
2102
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2103
*----------------------------------------------------------------------------*/
2104

    
2105
int float32_lt( float32 a, float32 b STATUS_PARAM )
2106
{
2107
    flag aSign, bSign;
2108
    bits32 av, bv;
2109

    
2110
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2111
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2112
       ) {
2113
        float_raise( float_flag_invalid STATUS_VAR);
2114
        return 0;
2115
    }
2116
    aSign = extractFloat32Sign( a );
2117
    bSign = extractFloat32Sign( b );
2118
    av = float32_val(a);
2119
    bv = float32_val(b);
2120
    if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2121
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2122

    
2123
}
2124

    
2125
/*----------------------------------------------------------------------------
2126
| Returns 1 if the single-precision floating-point value `a' is equal to
2127
| the corresponding value `b', and 0 otherwise.  The invalid exception is
2128
| raised if either operand is a NaN.  Otherwise, the comparison is performed
2129
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2130
*----------------------------------------------------------------------------*/
2131

    
2132
int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2133
{
2134
    bits32 av, bv;
2135

    
2136
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2137
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2138
       ) {
2139
        float_raise( float_flag_invalid STATUS_VAR);
2140
        return 0;
2141
    }
2142
    av = float32_val(a);
2143
    bv = float32_val(b);
2144
    return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2145

    
2146
}
2147

    
2148
/*----------------------------------------------------------------------------
2149
| Returns 1 if the single-precision floating-point value `a' is less than or
2150
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2151
| cause an exception.  Otherwise, the comparison is performed according to the
2152
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2153
*----------------------------------------------------------------------------*/
2154

    
2155
int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2156
{
2157
    flag aSign, bSign;
2158
    bits32 av, bv;
2159

    
2160
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2161
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2162
       ) {
2163
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2164
            float_raise( float_flag_invalid STATUS_VAR);
2165
        }
2166
        return 0;
2167
    }
2168
    aSign = extractFloat32Sign( a );
2169
    bSign = extractFloat32Sign( b );
2170
    av = float32_val(a);
2171
    bv = float32_val(b);
2172
    if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2173
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2174

    
2175
}
2176

    
2177
/*----------------------------------------------------------------------------
2178
| Returns 1 if the single-precision floating-point value `a' is less than
2179
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2180
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2181
| Standard for Binary Floating-Point Arithmetic.
2182
*----------------------------------------------------------------------------*/
2183

    
2184
int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2185
{
2186
    flag aSign, bSign;
2187
    bits32 av, bv;
2188

    
2189
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2190
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2191
       ) {
2192
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2193
            float_raise( float_flag_invalid STATUS_VAR);
2194
        }
2195
        return 0;
2196
    }
2197
    aSign = extractFloat32Sign( a );
2198
    bSign = extractFloat32Sign( b );
2199
    av = float32_val(a);
2200
    bv = float32_val(b);
2201
    if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2202
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2203

    
2204
}
2205

    
2206
/*----------------------------------------------------------------------------
2207
| Returns the result of converting the double-precision floating-point value
2208
| `a' to the 32-bit two's complement integer format.  The conversion is
2209
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2210
| Arithmetic---which means in particular that the conversion is rounded
2211
| according to the current rounding mode.  If `a' is a NaN, the largest
2212
| positive integer is returned.  Otherwise, if the conversion overflows, the
2213
| largest integer with the same sign as `a' is returned.
2214
*----------------------------------------------------------------------------*/
2215

    
2216
int32 float64_to_int32( float64 a STATUS_PARAM )
2217
{
2218
    flag aSign;
2219
    int16 aExp, shiftCount;
2220
    bits64 aSig;
2221

    
2222
    aSig = extractFloat64Frac( a );
2223
    aExp = extractFloat64Exp( a );
2224
    aSign = extractFloat64Sign( a );
2225
    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2226
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2227
    shiftCount = 0x42C - aExp;
2228
    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2229
    return roundAndPackInt32( aSign, aSig STATUS_VAR );
2230

    
2231
}
2232

    
2233
/*----------------------------------------------------------------------------
2234
| Returns the result of converting the double-precision floating-point value
2235
| `a' to the 32-bit two's complement integer format.  The conversion is
2236
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2237
| Arithmetic, except that the conversion is always rounded toward zero.
2238
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2239
| the conversion overflows, the largest integer with the same sign as `a' is
2240
| returned.
2241
*----------------------------------------------------------------------------*/
2242

    
2243
int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2244
{
2245
    flag aSign;
2246
    int16 aExp, shiftCount;
2247
    bits64 aSig, savedASig;
2248
    int32 z;
2249

    
2250
    aSig = extractFloat64Frac( a );
2251
    aExp = extractFloat64Exp( a );
2252
    aSign = extractFloat64Sign( a );
2253
    if ( 0x41E < aExp ) {
2254
        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2255
        goto invalid;
2256
    }
2257
    else if ( aExp < 0x3FF ) {
2258
        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2259
        return 0;
2260
    }
2261
    aSig |= LIT64( 0x0010000000000000 );
2262
    shiftCount = 0x433 - aExp;
2263
    savedASig = aSig;
2264
    aSig >>= shiftCount;
2265
    z = aSig;
2266
    if ( aSign ) z = - z;
2267
    if ( ( z < 0 ) ^ aSign ) {
2268
 invalid:
2269
        float_raise( float_flag_invalid STATUS_VAR);
2270
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2271
    }
2272
    if ( ( aSig<<shiftCount ) != savedASig ) {
2273
        STATUS(float_exception_flags) |= float_flag_inexact;
2274
    }
2275
    return z;
2276

    
2277
}
2278

    
2279
/*----------------------------------------------------------------------------
2280
| Returns the result of converting the double-precision floating-point value
2281
| `a' to the 64-bit two's complement integer format.  The conversion is
2282
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2283
| Arithmetic---which means in particular that the conversion is rounded
2284
| according to the current rounding mode.  If `a' is a NaN, the largest
2285
| positive integer is returned.  Otherwise, if the conversion overflows, the
2286
| largest integer with the same sign as `a' is returned.
2287
*----------------------------------------------------------------------------*/
2288

    
2289
int64 float64_to_int64( float64 a STATUS_PARAM )
2290
{
2291
    flag aSign;
2292
    int16 aExp, shiftCount;
2293
    bits64 aSig, aSigExtra;
2294

    
2295
    aSig = extractFloat64Frac( a );
2296
    aExp = extractFloat64Exp( a );
2297
    aSign = extractFloat64Sign( a );
2298
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2299
    shiftCount = 0x433 - aExp;
2300
    if ( shiftCount <= 0 ) {
2301
        if ( 0x43E < aExp ) {
2302
            float_raise( float_flag_invalid STATUS_VAR);
2303
            if (    ! aSign
2304
                 || (    ( aExp == 0x7FF )
2305
                      && ( aSig != LIT64( 0x0010000000000000 ) ) )
2306
               ) {
2307
                return LIT64( 0x7FFFFFFFFFFFFFFF );
2308
            }
2309
            return (sbits64) LIT64( 0x8000000000000000 );
2310
        }
2311
        aSigExtra = 0;
2312
        aSig <<= - shiftCount;
2313
    }
2314
    else {
2315
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2316
    }
2317
    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2318

    
2319
}
2320

    
2321
/*----------------------------------------------------------------------------
2322
| Returns the result of converting the double-precision floating-point value
2323
| `a' to the 64-bit two's complement integer format.  The conversion is
2324
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2325
| Arithmetic, except that the conversion is always rounded toward zero.
2326
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2327
| the conversion overflows, the largest integer with the same sign as `a' is
2328
| returned.
2329
*----------------------------------------------------------------------------*/
2330

    
2331
int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2332
{
2333
    flag aSign;
2334
    int16 aExp, shiftCount;
2335
    bits64 aSig;
2336
    int64 z;
2337

    
2338
    aSig = extractFloat64Frac( a );
2339
    aExp = extractFloat64Exp( a );
2340
    aSign = extractFloat64Sign( a );
2341
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2342
    shiftCount = aExp - 0x433;
2343
    if ( 0 <= shiftCount ) {
2344
        if ( 0x43E <= aExp ) {
2345
            if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2346
                float_raise( float_flag_invalid STATUS_VAR);
2347
                if (    ! aSign
2348
                     || (    ( aExp == 0x7FF )
2349
                          && ( aSig != LIT64( 0x0010000000000000 ) ) )
2350
                   ) {
2351
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
2352
                }
2353
            }
2354
            return (sbits64) LIT64( 0x8000000000000000 );
2355
        }
2356
        z = aSig<<shiftCount;
2357
    }
2358
    else {
2359
        if ( aExp < 0x3FE ) {
2360
            if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2361
            return 0;
2362
        }
2363
        z = aSig>>( - shiftCount );
2364
        if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2365
            STATUS(float_exception_flags) |= float_flag_inexact;
2366
        }
2367
    }
2368
    if ( aSign ) z = - z;
2369
    return z;
2370

    
2371
}
2372

    
2373
/*----------------------------------------------------------------------------
2374
| Returns the result of converting the double-precision floating-point value
2375
| `a' to the single-precision floating-point format.  The conversion is
2376
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2377
| Arithmetic.
2378
*----------------------------------------------------------------------------*/
2379

    
2380
float32 float64_to_float32( float64 a STATUS_PARAM )
2381
{
2382
    flag aSign;
2383
    int16 aExp;
2384
    bits64 aSig;
2385
    bits32 zSig;
2386

    
2387
    aSig = extractFloat64Frac( a );
2388
    aExp = extractFloat64Exp( a );
2389
    aSign = extractFloat64Sign( a );
2390
    if ( aExp == 0x7FF ) {
2391
        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2392
        return packFloat32( aSign, 0xFF, 0 );
2393
    }
2394
    shift64RightJamming( aSig, 22, &aSig );
2395
    zSig = aSig;
2396
    if ( aExp || zSig ) {
2397
        zSig |= 0x40000000;
2398
        aExp -= 0x381;
2399
    }
2400
    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2401

    
2402
}
2403

    
2404
#ifdef FLOATX80
2405

    
2406
/*----------------------------------------------------------------------------
2407
| Returns the result of converting the double-precision floating-point value
2408
| `a' to the extended double-precision floating-point format.  The conversion
2409
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2410
| Arithmetic.
2411
*----------------------------------------------------------------------------*/
2412

    
2413
floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2414
{
2415
    flag aSign;
2416
    int16 aExp;
2417
    bits64 aSig;
2418

    
2419
    aSig = extractFloat64Frac( a );
2420
    aExp = extractFloat64Exp( a );
2421
    aSign = extractFloat64Sign( a );
2422
    if ( aExp == 0x7FF ) {
2423
        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2424
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2425
    }
2426
    if ( aExp == 0 ) {
2427
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2428
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2429
    }
2430
    return
2431
        packFloatx80(
2432
            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2433

    
2434
}
2435

    
2436
#endif
2437

    
2438
#ifdef FLOAT128
2439

    
2440
/*----------------------------------------------------------------------------
2441
| Returns the result of converting the double-precision floating-point value
2442
| `a' to the quadruple-precision floating-point format.  The conversion is
2443
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2444
| Arithmetic.
2445
*----------------------------------------------------------------------------*/
2446

    
2447
float128 float64_to_float128( float64 a STATUS_PARAM )
2448
{
2449
    flag aSign;
2450
    int16 aExp;
2451
    bits64 aSig, zSig0, zSig1;
2452

    
2453
    aSig = extractFloat64Frac( a );
2454
    aExp = extractFloat64Exp( a );
2455
    aSign = extractFloat64Sign( a );
2456
    if ( aExp == 0x7FF ) {
2457
        if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2458
        return packFloat128( aSign, 0x7FFF, 0, 0 );
2459
    }
2460
    if ( aExp == 0 ) {
2461
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2462
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2463
        --aExp;
2464
    }
2465
    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2466
    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2467

    
2468
}
2469

    
2470
#endif
2471

    
2472
/*----------------------------------------------------------------------------
2473
| Rounds the double-precision floating-point value `a' to an integer, and
2474
| returns the result as a double-precision floating-point value.  The
2475
| operation is performed according to the IEC/IEEE Standard for Binary
2476
| Floating-Point Arithmetic.
2477
*----------------------------------------------------------------------------*/
2478

    
2479
float64 float64_round_to_int( float64 a STATUS_PARAM )
2480
{
2481
    flag aSign;
2482
    int16 aExp;
2483
    bits64 lastBitMask, roundBitsMask;
2484
    int8 roundingMode;
2485
    bits64 z;
2486

    
2487
    aExp = extractFloat64Exp( a );
2488
    if ( 0x433 <= aExp ) {
2489
        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2490
            return propagateFloat64NaN( a, a STATUS_VAR );
2491
        }
2492
        return a;
2493
    }
2494
    if ( aExp < 0x3FF ) {
2495
        if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
2496
        STATUS(float_exception_flags) |= float_flag_inexact;
2497
        aSign = extractFloat64Sign( a );
2498
        switch ( STATUS(float_rounding_mode) ) {
2499
         case float_round_nearest_even:
2500
            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2501
                return packFloat64( aSign, 0x3FF, 0 );
2502
            }
2503
            break;
2504
         case float_round_down:
2505
            return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2506
         case float_round_up:
2507
            return make_float64(
2508
            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2509
        }
2510
        return packFloat64( aSign, 0, 0 );
2511
    }
2512
    lastBitMask = 1;
2513
    lastBitMask <<= 0x433 - aExp;
2514
    roundBitsMask = lastBitMask - 1;
2515
    z = float64_val(a);
2516
    roundingMode = STATUS(float_rounding_mode);
2517
    if ( roundingMode == float_round_nearest_even ) {
2518
        z += lastBitMask>>1;
2519
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2520
    }
2521
    else if ( roundingMode != float_round_to_zero ) {
2522
        if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
2523
            z += roundBitsMask;
2524
        }
2525
    }
2526
    z &= ~ roundBitsMask;
2527
    if ( z != float64_val(a) )
2528
        STATUS(float_exception_flags) |= float_flag_inexact;
2529
    return make_float64(z);
2530

    
2531
}
2532

    
2533
float64 float64_trunc_to_int( float64 a STATUS_PARAM)
2534
{
2535
    int oldmode;
2536
    float64 res;
2537
    oldmode = STATUS(float_rounding_mode);
2538
    STATUS(float_rounding_mode) = float_round_to_zero;
2539
    res = float64_round_to_int(a STATUS_VAR);
2540
    STATUS(float_rounding_mode) = oldmode;
2541
    return res;
2542
}
2543

    
2544
/*----------------------------------------------------------------------------
2545
| Returns the result of adding the absolute values of the double-precision
2546
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2547
| before being returned.  `zSign' is ignored if the result is a NaN.
2548
| The addition is performed according to the IEC/IEEE Standard for Binary
2549
| Floating-Point Arithmetic.
2550
*----------------------------------------------------------------------------*/
2551

    
2552
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2553
{
2554
    int16 aExp, bExp, zExp;
2555
    bits64 aSig, bSig, zSig;
2556
    int16 expDiff;
2557

    
2558
    aSig = extractFloat64Frac( a );
2559
    aExp = extractFloat64Exp( a );
2560
    bSig = extractFloat64Frac( b );
2561
    bExp = extractFloat64Exp( b );
2562
    expDiff = aExp - bExp;
2563
    aSig <<= 9;
2564
    bSig <<= 9;
2565
    if ( 0 < expDiff ) {
2566
        if ( aExp == 0x7FF ) {
2567
            if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2568
            return a;
2569
        }
2570
        if ( bExp == 0 ) {
2571
            --expDiff;
2572
        }
2573
        else {
2574
            bSig |= LIT64( 0x2000000000000000 );
2575
        }
2576
        shift64RightJamming( bSig, expDiff, &bSig );
2577
        zExp = aExp;
2578
    }
2579
    else if ( expDiff < 0 ) {
2580
        if ( bExp == 0x7FF ) {
2581
            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2582
            return packFloat64( zSign, 0x7FF, 0 );
2583
        }
2584
        if ( aExp == 0 ) {
2585
            ++expDiff;
2586
        }
2587
        else {
2588
            aSig |= LIT64( 0x2000000000000000 );
2589
        }
2590
        shift64RightJamming( aSig, - expDiff, &aSig );
2591
        zExp = bExp;
2592
    }
2593
    else {
2594
        if ( aExp == 0x7FF ) {
2595
            if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2596
            return a;
2597
        }
2598
        if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2599
        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2600
        zExp = aExp;
2601
        goto roundAndPack;
2602
    }
2603
    aSig |= LIT64( 0x2000000000000000 );
2604
    zSig = ( aSig + bSig )<<1;
2605
    --zExp;
2606
    if ( (sbits64) zSig < 0 ) {
2607
        zSig = aSig + bSig;
2608
        ++zExp;
2609
    }
2610
 roundAndPack:
2611
    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2612

    
2613
}
2614

    
2615
/*----------------------------------------------------------------------------
2616
| Returns the result of subtracting the absolute values of the double-
2617
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
2618
| difference is negated before being returned.  `zSign' is ignored if the
2619
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
2620
| Standard for Binary Floating-Point Arithmetic.
2621
*----------------------------------------------------------------------------*/
2622

    
2623
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2624
{
2625
    int16 aExp, bExp, zExp;
2626
    bits64 aSig, bSig, zSig;
2627
    int16 expDiff;
2628

    
2629
    aSig = extractFloat64Frac( a );
2630
    aExp = extractFloat64Exp( a );
2631
    bSig = extractFloat64Frac( b );
2632
    bExp = extractFloat64Exp( b );
2633
    expDiff = aExp - bExp;
2634
    aSig <<= 10;
2635
    bSig <<= 10;
2636
    if ( 0 < expDiff ) goto aExpBigger;
2637
    if ( expDiff < 0 ) goto bExpBigger;
2638
    if ( aExp == 0x7FF ) {
2639
        if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2640
        float_raise( float_flag_invalid STATUS_VAR);
2641
        return float64_default_nan;
2642
    }
2643
    if ( aExp == 0 ) {
2644
        aExp = 1;
2645
        bExp = 1;
2646
    }
2647
    if ( bSig < aSig ) goto aBigger;
2648
    if ( aSig < bSig ) goto bBigger;
2649
    return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2650
 bExpBigger:
2651
    if ( bExp == 0x7FF ) {
2652
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2653
        return packFloat64( zSign ^ 1, 0x7FF, 0 );
2654
    }
2655
    if ( aExp == 0 ) {
2656
        ++expDiff;
2657
    }
2658
    else {
2659
        aSig |= LIT64( 0x4000000000000000 );
2660
    }
2661
    shift64RightJamming( aSig, - expDiff, &aSig );
2662
    bSig |= LIT64( 0x4000000000000000 );
2663
 bBigger:
2664
    zSig = bSig - aSig;
2665
    zExp = bExp;
2666
    zSign ^= 1;
2667
    goto normalizeRoundAndPack;
2668
 aExpBigger:
2669
    if ( aExp == 0x7FF ) {
2670
        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2671
        return a;
2672
    }
2673
    if ( bExp == 0 ) {
2674
        --expDiff;
2675
    }
2676
    else {
2677
        bSig |= LIT64( 0x4000000000000000 );
2678
    }
2679
    shift64RightJamming( bSig, expDiff, &bSig );
2680
    aSig |= LIT64( 0x4000000000000000 );
2681
 aBigger:
2682
    zSig = aSig - bSig;
2683
    zExp = aExp;
2684
 normalizeRoundAndPack:
2685
    --zExp;
2686
    return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2687

    
2688
}
2689

    
2690
/*----------------------------------------------------------------------------
2691
| Returns the result of adding the double-precision floating-point values `a'
2692
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
2693
| Binary Floating-Point Arithmetic.
2694
*----------------------------------------------------------------------------*/
2695

    
2696
float64 float64_add( float64 a, float64 b STATUS_PARAM )
2697
{
2698
    flag aSign, bSign;
2699

    
2700
    aSign = extractFloat64Sign( a );
2701
    bSign = extractFloat64Sign( b );
2702
    if ( aSign == bSign ) {
2703
        return addFloat64Sigs( a, b, aSign STATUS_VAR );
2704
    }
2705
    else {
2706
        return subFloat64Sigs( a, b, aSign STATUS_VAR );
2707
    }
2708

    
2709
}
2710

    
2711
/*----------------------------------------------------------------------------
2712
| Returns the result of subtracting the double-precision floating-point values
2713
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2714
| for Binary Floating-Point Arithmetic.
2715
*----------------------------------------------------------------------------*/
2716

    
2717
float64 float64_sub( float64 a, float64 b STATUS_PARAM )
2718
{
2719
    flag aSign, bSign;
2720

    
2721
    aSign = extractFloat64Sign( a );
2722
    bSign = extractFloat64Sign( b );
2723
    if ( aSign == bSign ) {
2724
        return subFloat64Sigs( a, b, aSign STATUS_VAR );
2725
    }
2726
    else {
2727
        return addFloat64Sigs( a, b, aSign STATUS_VAR );
2728
    }
2729

    
2730
}
2731

    
2732
/*----------------------------------------------------------------------------
2733
| Returns the result of multiplying the double-precision floating-point values
2734
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2735
| for Binary Floating-Point Arithmetic.
2736
*----------------------------------------------------------------------------*/
2737

    
2738
float64 float64_mul( float64 a, float64 b STATUS_PARAM )
2739
{
2740
    flag aSign, bSign, zSign;
2741
    int16 aExp, bExp, zExp;
2742
    bits64 aSig, bSig, zSig0, zSig1;
2743

    
2744
    aSig = extractFloat64Frac( a );
2745
    aExp = extractFloat64Exp( a );
2746
    aSign = extractFloat64Sign( a );
2747
    bSig = extractFloat64Frac( b );
2748
    bExp = extractFloat64Exp( b );
2749
    bSign = extractFloat64Sign( b );
2750
    zSign = aSign ^ bSign;
2751
    if ( aExp == 0x7FF ) {
2752
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2753
            return propagateFloat64NaN( a, b STATUS_VAR );
2754
        }
2755
        if ( ( bExp | bSig ) == 0 ) {
2756
            float_raise( float_flag_invalid STATUS_VAR);
2757
            return float64_default_nan;
2758
        }
2759
        return packFloat64( zSign, 0x7FF, 0 );
2760
    }
2761
    if ( bExp == 0x7FF ) {
2762
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2763
        if ( ( aExp | aSig ) == 0 ) {
2764
            float_raise( float_flag_invalid STATUS_VAR);
2765
            return float64_default_nan;
2766
        }
2767
        return packFloat64( zSign, 0x7FF, 0 );
2768
    }
2769
    if ( aExp == 0 ) {
2770
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2771
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2772
    }
2773
    if ( bExp == 0 ) {
2774
        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2775
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2776
    }
2777
    zExp = aExp + bExp - 0x3FF;
2778
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2779
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2780
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
2781
    zSig0 |= ( zSig1 != 0 );
2782
    if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2783
        zSig0 <<= 1;
2784
        --zExp;
2785
    }
2786
    return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
2787

    
2788
}
2789

    
2790
/*----------------------------------------------------------------------------
2791
| Returns the result of dividing the double-precision floating-point value `a'
2792
| by the corresponding value `b'.  The operation is performed according to
2793
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2794
*----------------------------------------------------------------------------*/
2795

    
2796
float64 float64_div( float64 a, float64 b STATUS_PARAM )
2797
{
2798
    flag aSign, bSign, zSign;
2799
    int16 aExp, bExp, zExp;
2800
    bits64 aSig, bSig, zSig;
2801
    bits64 rem0, rem1;
2802
    bits64 term0, term1;
2803

    
2804
    aSig = extractFloat64Frac( a );
2805
    aExp = extractFloat64Exp( a );
2806
    aSign = extractFloat64Sign( a );
2807
    bSig = extractFloat64Frac( b );
2808
    bExp = extractFloat64Exp( b );
2809
    bSign = extractFloat64Sign( b );
2810
    zSign = aSign ^ bSign;
2811
    if ( aExp == 0x7FF ) {
2812
        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2813
        if ( bExp == 0x7FF ) {
2814
            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2815
            float_raise( float_flag_invalid STATUS_VAR);
2816
            return float64_default_nan;
2817
        }
2818
        return packFloat64( zSign, 0x7FF, 0 );
2819
    }
2820
    if ( bExp == 0x7FF ) {
2821
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2822
        return packFloat64( zSign, 0, 0 );
2823
    }
2824
    if ( bExp == 0 ) {
2825
        if ( bSig == 0 ) {
2826
            if ( ( aExp | aSig ) == 0 ) {
2827
                float_raise( float_flag_invalid STATUS_VAR);
2828
                return float64_default_nan;
2829
            }
2830
            float_raise( float_flag_divbyzero STATUS_VAR);
2831
            return packFloat64( zSign, 0x7FF, 0 );
2832
        }
2833
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2834
    }
2835
    if ( aExp == 0 ) {
2836
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2837
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2838
    }
2839
    zExp = aExp - bExp + 0x3FD;
2840
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2841
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2842
    if ( bSig <= ( aSig + aSig ) ) {
2843
        aSig >>= 1;
2844
        ++zExp;
2845
    }
2846
    zSig = estimateDiv128To64( aSig, 0, bSig );
2847
    if ( ( zSig & 0x1FF ) <= 2 ) {
2848
        mul64To128( bSig, zSig, &term0, &term1 );
2849
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2850
        while ( (sbits64) rem0 < 0 ) {
2851
            --zSig;
2852
            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2853
        }
2854
        zSig |= ( rem1 != 0 );
2855
    }
2856
    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2857

    
2858
}
2859

    
2860
/*----------------------------------------------------------------------------
2861
| Returns the remainder of the double-precision floating-point value `a'
2862
| with respect to the corresponding value `b'.  The operation is performed
2863
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2864
*----------------------------------------------------------------------------*/
2865

    
2866
float64 float64_rem( float64 a, float64 b STATUS_PARAM )
2867
{
2868
    flag aSign, bSign, zSign;
2869
    int16 aExp, bExp, expDiff;
2870
    bits64 aSig, bSig;
2871
    bits64 q, alternateASig;
2872
    sbits64 sigMean;
2873

    
2874
    aSig = extractFloat64Frac( a );
2875
    aExp = extractFloat64Exp( a );
2876
    aSign = extractFloat64Sign( a );
2877
    bSig = extractFloat64Frac( b );
2878
    bExp = extractFloat64Exp( b );
2879
    bSign = extractFloat64Sign( b );
2880
    if ( aExp == 0x7FF ) {
2881
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2882
            return propagateFloat64NaN( a, b STATUS_VAR );
2883
        }
2884
        float_raise( float_flag_invalid STATUS_VAR);
2885
        return float64_default_nan;
2886
    }
2887
    if ( bExp == 0x7FF ) {
2888
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2889
        return a;
2890
    }
2891
    if ( bExp == 0 ) {
2892
        if ( bSig == 0 ) {
2893
            float_raise( float_flag_invalid STATUS_VAR);
2894
            return float64_default_nan;
2895
        }
2896
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2897
    }
2898
    if ( aExp == 0 ) {
2899
        if ( aSig == 0 ) return a;
2900
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2901
    }
2902
    expDiff = aExp - bExp;
2903
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2904
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2905
    if ( expDiff < 0 ) {
2906
        if ( expDiff < -1 ) return a;
2907
        aSig >>= 1;
2908
    }
2909
    q = ( bSig <= aSig );
2910
    if ( q ) aSig -= bSig;
2911
    expDiff -= 64;
2912
    while ( 0 < expDiff ) {
2913
        q = estimateDiv128To64( aSig, 0, bSig );
2914
        q = ( 2 < q ) ? q - 2 : 0;
2915
        aSig = - ( ( bSig>>2 ) * q );
2916
        expDiff -= 62;
2917
    }
2918
    expDiff += 64;
2919
    if ( 0 < expDiff ) {
2920
        q = estimateDiv128To64( aSig, 0, bSig );
2921
        q = ( 2 < q ) ? q - 2 : 0;
2922
        q >>= 64 - expDiff;
2923
        bSig >>= 2;
2924
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2925
    }
2926
    else {
2927
        aSig >>= 2;
2928
        bSig >>= 2;
2929
    }
2930
    do {
2931
        alternateASig = aSig;
2932
        ++q;
2933
        aSig -= bSig;
2934
    } while ( 0 <= (sbits64) aSig );
2935
    sigMean = aSig + alternateASig;
2936
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2937
        aSig = alternateASig;
2938
    }
2939
    zSign = ( (sbits64) aSig < 0 );
2940
    if ( zSign ) aSig = - aSig;
2941
    return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
2942

    
2943
}
2944

    
2945
/*----------------------------------------------------------------------------
2946
| Returns the square root of the double-precision floating-point value `a'.
2947
| The operation is performed according to the IEC/IEEE Standard for Binary
2948
| Floating-Point Arithmetic.
2949
*----------------------------------------------------------------------------*/
2950

    
2951
float64 float64_sqrt( float64 a STATUS_PARAM )
2952
{
2953
    flag aSign;
2954
    int16 aExp, zExp;
2955
    bits64 aSig, zSig, doubleZSig;
2956
    bits64 rem0, rem1, term0, term1;
2957

    
2958
    aSig = extractFloat64Frac( a );
2959
    aExp = extractFloat64Exp( a );
2960
    aSign = extractFloat64Sign( a );
2961
    if ( aExp == 0x7FF ) {
2962
        if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
2963
        if ( ! aSign ) return a;
2964
        float_raise( float_flag_invalid STATUS_VAR);
2965
        return float64_default_nan;
2966
    }
2967
    if ( aSign ) {
2968
        if ( ( aExp | aSig ) == 0 ) return a;
2969
        float_raise( float_flag_invalid STATUS_VAR);
2970
        return float64_default_nan;
2971
    }
2972
    if ( aExp == 0 ) {
2973
        if ( aSig == 0 ) return float64_zero;
2974
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2975
    }
2976
    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2977
    aSig |= LIT64( 0x0010000000000000 );
2978
    zSig = estimateSqrt32( aExp, aSig>>21 );
2979
    aSig <<= 9 - ( aExp & 1 );
2980
    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
2981
    if ( ( zSig & 0x1FF ) <= 5 ) {
2982
        doubleZSig = zSig<<1;
2983
        mul64To128( zSig, zSig, &term0, &term1 );
2984
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2985
        while ( (sbits64) rem0 < 0 ) {
2986
            --zSig;
2987
            doubleZSig -= 2;
2988
            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
2989
        }
2990
        zSig |= ( ( rem0 | rem1 ) != 0 );
2991
    }
2992
    return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
2993

    
2994
}
2995

    
2996
/*----------------------------------------------------------------------------
2997
| Returns 1 if the double-precision floating-point value `a' is equal to the
2998
| corresponding value `b', and 0 otherwise.  The comparison is performed
2999
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3000
*----------------------------------------------------------------------------*/
3001

    
3002
int float64_eq( float64 a, float64 b STATUS_PARAM )
3003
{
3004
    bits64 av, bv;
3005

    
3006
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3007
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3008
       ) {
3009
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3010
            float_raise( float_flag_invalid STATUS_VAR);
3011
        }
3012
        return 0;
3013
    }
3014
    av = float64_val(a);
3015
    bv = float64_val(b);
3016
    return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3017

    
3018
}
3019

    
3020
/*----------------------------------------------------------------------------
3021
| Returns 1 if the double-precision floating-point value `a' is less than or
3022
| equal to the corresponding value `b', and 0 otherwise.  The comparison is
3023
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3024
| Arithmetic.
3025
*----------------------------------------------------------------------------*/
3026

    
3027
int float64_le( float64 a, float64 b STATUS_PARAM )
3028
{
3029
    flag aSign, bSign;
3030
    bits64 av, bv;
3031

    
3032
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3033
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3034
       ) {
3035
        float_raise( float_flag_invalid STATUS_VAR);
3036
        return 0;
3037
    }
3038
    aSign = extractFloat64Sign( a );
3039
    bSign = extractFloat64Sign( b );
3040
    av = float64_val(a);
3041
    bv = float64_val(b);
3042
    if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3043
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3044

    
3045
}
3046

    
3047
/*----------------------------------------------------------------------------
3048
| Returns 1 if the double-precision floating-point value `a' is less than
3049
| the corresponding value `b', and 0 otherwise.  The comparison is performed
3050
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3051
*----------------------------------------------------------------------------*/
3052

    
3053
int float64_lt( float64 a, float64 b STATUS_PARAM )
3054
{
3055
    flag aSign, bSign;
3056
    bits64 av, bv;
3057

    
3058
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3059
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3060
       ) {
3061
        float_raise( float_flag_invalid STATUS_VAR);
3062
        return 0;
3063
    }
3064
    aSign = extractFloat64Sign( a );
3065
    bSign = extractFloat64Sign( b );
3066
    av = float64_val(a);
3067
    bv = float64_val(b);
3068
    if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3069
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3070

    
3071
}
3072

    
3073
/*----------------------------------------------------------------------------
3074
| Returns 1 if the double-precision floating-point value `a' is equal to the
3075
| corresponding value `b', and 0 otherwise.  The invalid exception is raised
3076
| if either operand is a NaN.  Otherwise, the comparison is performed
3077
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3078
*----------------------------------------------------------------------------*/
3079

    
3080
int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3081
{
3082
    bits64 av, bv;
3083

    
3084
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3085
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3086
       ) {
3087
        float_raise( float_flag_invalid STATUS_VAR);
3088
        return 0;
3089
    }
3090
    av = float64_val(a);
3091
    bv = float64_val(b);
3092
    return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3093

    
3094
}
3095

    
3096
/*----------------------------------------------------------------------------
3097
| Returns 1 if the double-precision floating-point value `a' is less than or
3098
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3099
| cause an exception.  Otherwise, the comparison is performed according to the
3100
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3101
*----------------------------------------------------------------------------*/
3102

    
3103
int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3104
{
3105
    flag aSign, bSign;
3106
    bits64 av, bv;
3107

    
3108
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3109
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3110
       ) {
3111
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3112
            float_raise( float_flag_invalid STATUS_VAR);
3113
        }
3114
        return 0;
3115
    }
3116
    aSign = extractFloat64Sign( a );
3117
    bSign = extractFloat64Sign( b );
3118
    av = float64_val(a);
3119
    bv = float64_val(b);
3120
    if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3121
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3122

    
3123
}
3124

    
3125
/*----------------------------------------------------------------------------
3126
| Returns 1 if the double-precision floating-point value `a' is less than
3127
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3128
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3129
| Standard for Binary Floating-Point Arithmetic.
3130
*----------------------------------------------------------------------------*/
3131

    
3132
int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3133
{
3134
    flag aSign, bSign;
3135
    bits64 av, bv;
3136

    
3137
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3138
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3139
       ) {
3140
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3141
            float_raise( float_flag_invalid STATUS_VAR);
3142
        }
3143
        return 0;
3144
    }
3145
    aSign = extractFloat64Sign( a );
3146
    bSign = extractFloat64Sign( b );
3147
    av = float64_val(a);
3148
    bv = float64_val(b);
3149
    if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3150
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3151

    
3152
}
3153

    
3154
#ifdef FLOATX80
3155

    
3156
/*----------------------------------------------------------------------------
3157
| Returns the result of converting the extended double-precision floating-
3158
| point value `a' to the 32-bit two's complement integer format.  The
3159
| conversion is performed according to the IEC/IEEE Standard for Binary
3160
| Floating-Point Arithmetic---which means in particular that the conversion
3161
| is rounded according to the current rounding mode.  If `a' is a NaN, the
3162
| largest positive integer is returned.  Otherwise, if the conversion
3163
| overflows, the largest integer with the same sign as `a' is returned.
3164
*----------------------------------------------------------------------------*/
3165

    
3166
int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3167
{
3168
    flag aSign;
3169
    int32 aExp, shiftCount;
3170
    bits64 aSig;
3171

    
3172
    aSig = extractFloatx80Frac( a );
3173
    aExp = extractFloatx80Exp( a );
3174
    aSign = extractFloatx80Sign( a );
3175
    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3176
    shiftCount = 0x4037 - aExp;
3177
    if ( shiftCount <= 0 ) shiftCount = 1;
3178
    shift64RightJamming( aSig, shiftCount, &aSig );
3179
    return roundAndPackInt32( aSign, aSig STATUS_VAR );
3180

    
3181
}
3182

    
3183
/*----------------------------------------------------------------------------
3184
| Returns the result of converting the extended double-precision floating-
3185
| point value `a' to the 32-bit two's complement integer format.  The
3186
| conversion is performed according to the IEC/IEEE Standard for Binary
3187
| Floating-Point Arithmetic, except that the conversion is always rounded
3188
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3189
| Otherwise, if the conversion overflows, the largest integer with the same
3190
| sign as `a' is returned.
3191
*----------------------------------------------------------------------------*/
3192

    
3193
int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3194
{
3195
    flag aSign;
3196
    int32 aExp, shiftCount;
3197
    bits64 aSig, savedASig;
3198
    int32 z;
3199

    
3200
    aSig = extractFloatx80Frac( a );
3201
    aExp = extractFloatx80Exp( a );
3202
    aSign = extractFloatx80Sign( a );
3203
    if ( 0x401E < aExp ) {
3204
        if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3205
        goto invalid;
3206
    }
3207
    else if ( aExp < 0x3FFF ) {
3208
        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3209
        return 0;
3210
    }
3211
    shiftCount = 0x403E - aExp;
3212
    savedASig = aSig;
3213
    aSig >>= shiftCount;
3214
    z = aSig;
3215
    if ( aSign ) z = - z;
3216
    if ( ( z < 0 ) ^ aSign ) {
3217
 invalid:
3218
        float_raise( float_flag_invalid STATUS_VAR);
3219
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3220
    }
3221
    if ( ( aSig<<shiftCount ) != savedASig ) {
3222
        STATUS(float_exception_flags) |= float_flag_inexact;
3223
    }
3224
    return z;
3225

    
3226
}
3227

    
3228
/*----------------------------------------------------------------------------
3229
| Returns the result of converting the extended double-precision floating-
3230
| point value `a' to the 64-bit two's complement integer format.  The
3231
| conversion is performed according to the IEC/IEEE Standard for Binary
3232
| Floating-Point Arithmetic---which means in particular that the conversion
3233
| is rounded according to the current rounding mode.  If `a' is a NaN,
3234
| the largest positive integer is returned.  Otherwise, if the conversion
3235
| overflows, the largest integer with the same sign as `a' is returned.
3236
*----------------------------------------------------------------------------*/
3237

    
3238
int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3239
{
3240
    flag aSign;
3241
    int32 aExp, shiftCount;
3242
    bits64 aSig, aSigExtra;
3243

    
3244
    aSig = extractFloatx80Frac( a );
3245
    aExp = extractFloatx80Exp( a );
3246
    aSign = extractFloatx80Sign( a );
3247
    shiftCount = 0x403E - aExp;
3248
    if ( shiftCount <= 0 ) {
3249
        if ( shiftCount ) {
3250
            float_raise( float_flag_invalid STATUS_VAR);
3251
            if (    ! aSign
3252
                 || (    ( aExp == 0x7FFF )
3253
                      && ( aSig != LIT64( 0x8000000000000000 ) ) )
3254
               ) {
3255
                return LIT64( 0x7FFFFFFFFFFFFFFF );
3256
            }
3257
            return (sbits64) LIT64( 0x8000000000000000 );
3258
        }
3259
        aSigExtra = 0;
3260
    }
3261
    else {
3262
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3263
    }
3264
    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3265

    
3266
}
3267

    
3268
/*----------------------------------------------------------------------------
3269
| Returns the result of converting the extended double-precision floating-
3270
| point value `a' to the 64-bit two's complement integer format.  The
3271
| conversion is performed according to the IEC/IEEE Standard for Binary
3272
| Floating-Point Arithmetic, except that the conversion is always rounded
3273
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3274
| Otherwise, if the conversion overflows, the largest integer with the same
3275
| sign as `a' is returned.
3276
*----------------------------------------------------------------------------*/
3277

    
3278
int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3279
{
3280
    flag aSign;
3281
    int32 aExp, shiftCount;
3282
    bits64 aSig;
3283
    int64 z;
3284

    
3285
    aSig = extractFloatx80Frac( a );
3286
    aExp = extractFloatx80Exp( a );
3287
    aSign = extractFloatx80Sign( a );
3288
    shiftCount = aExp - 0x403E;
3289
    if ( 0 <= shiftCount ) {
3290
        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3291
        if ( ( a.high != 0xC03E ) || aSig ) {
3292
            float_raise( float_flag_invalid STATUS_VAR);
3293
            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3294
                return LIT64( 0x7FFFFFFFFFFFFFFF );
3295
            }
3296
        }
3297
        return (sbits64) LIT64( 0x8000000000000000 );
3298
    }
3299
    else if ( aExp < 0x3FFF ) {
3300
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3301
        return 0;
3302
    }
3303
    z = aSig>>( - shiftCount );
3304
    if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3305
        STATUS(float_exception_flags) |= float_flag_inexact;
3306
    }
3307
    if ( aSign ) z = - z;
3308
    return z;
3309

    
3310
}
3311

    
3312
/*----------------------------------------------------------------------------
3313
| Returns the result of converting the extended double-precision floating-
3314
| point value `a' to the single-precision floating-point format.  The
3315
| conversion is performed according to the IEC/IEEE Standard for Binary
3316
| Floating-Point Arithmetic.
3317
*----------------------------------------------------------------------------*/
3318

    
3319
float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3320
{
3321
    flag aSign;
3322
    int32 aExp;
3323
    bits64 aSig;
3324

    
3325
    aSig = extractFloatx80Frac( a );
3326
    aExp = extractFloatx80Exp( a );
3327
    aSign = extractFloatx80Sign( a );
3328
    if ( aExp == 0x7FFF ) {
3329
        if ( (bits64) ( aSig<<1 ) ) {
3330
            return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3331
        }
3332
        return packFloat32( aSign, 0xFF, 0 );
3333
    }
3334
    shift64RightJamming( aSig, 33, &aSig );
3335
    if ( aExp || aSig ) aExp -= 0x3F81;
3336
    return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3337

    
3338
}
3339

    
3340
/*----------------------------------------------------------------------------
3341
| Returns the result of converting the extended double-precision floating-
3342
| point value `a' to the double-precision floating-point format.  The
3343
| conversion is performed according to the IEC/IEEE Standard for Binary
3344
| Floating-Point Arithmetic.
3345
*----------------------------------------------------------------------------*/
3346

    
3347
float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3348
{
3349
    flag aSign;
3350
    int32 aExp;
3351
    bits64 aSig, zSig;
3352

    
3353
    aSig = extractFloatx80Frac( a );
3354
    aExp = extractFloatx80Exp( a );
3355
    aSign = extractFloatx80Sign( a );
3356
    if ( aExp == 0x7FFF ) {
3357
        if ( (bits64) ( aSig<<1 ) ) {
3358
            return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3359
        }
3360
        return packFloat64( aSign, 0x7FF, 0 );
3361
    }
3362
    shift64RightJamming( aSig, 1, &zSig );
3363
    if ( aExp || aSig ) aExp -= 0x3C01;
3364
    return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3365

    
3366
}
3367

    
3368
#ifdef FLOAT128
3369

    
3370
/*----------------------------------------------------------------------------
3371
| Returns the result of converting the extended double-precision floating-
3372
| point value `a' to the quadruple-precision floating-point format.  The
3373
| conversion is performed according to the IEC/IEEE Standard for Binary
3374
| Floating-Point Arithmetic.
3375
*----------------------------------------------------------------------------*/
3376

    
3377
float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3378
{
3379
    flag aSign;
3380
    int16 aExp;
3381
    bits64 aSig, zSig0, zSig1;
3382

    
3383
    aSig = extractFloatx80Frac( a );
3384
    aExp = extractFloatx80Exp( a );
3385
    aSign = extractFloatx80Sign( a );
3386
    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3387
        return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3388
    }
3389
    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3390
    return packFloat128( aSign, aExp, zSig0, zSig1 );
3391

    
3392
}
3393

    
3394
#endif
3395

    
3396
/*----------------------------------------------------------------------------
3397
| Rounds the extended double-precision floating-point value `a' to an integer,
3398
| and returns the result as an extended quadruple-precision floating-point
3399
| value.  The operation is performed according to the IEC/IEEE Standard for
3400
| Binary Floating-Point Arithmetic.
3401
*----------------------------------------------------------------------------*/
3402

    
3403
floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3404
{
3405
    flag aSign;
3406
    int32 aExp;
3407
    bits64 lastBitMask, roundBitsMask;
3408
    int8 roundingMode;
3409
    floatx80 z;
3410

    
3411
    aExp = extractFloatx80Exp( a );
3412
    if ( 0x403E <= aExp ) {
3413
        if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3414
            return propagateFloatx80NaN( a, a STATUS_VAR );
3415
        }
3416
        return a;
3417
    }
3418
    if ( aExp < 0x3FFF ) {
3419
        if (    ( aExp == 0 )
3420
             && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3421
            return a;
3422
        }
3423
        STATUS(float_exception_flags) |= float_flag_inexact;
3424
        aSign = extractFloatx80Sign( a );
3425
        switch ( STATUS(float_rounding_mode) ) {
3426
         case float_round_nearest_even:
3427
            if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3428
               ) {
3429
                return
3430
                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3431
            }
3432
            break;
3433
         case float_round_down:
3434
            return
3435
                  aSign ?
3436
                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3437
                : packFloatx80( 0, 0, 0 );
3438
         case float_round_up:
3439
            return
3440
                  aSign ? packFloatx80( 1, 0, 0 )
3441
                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3442
        }
3443
        return packFloatx80( aSign, 0, 0 );
3444
    }
3445
    lastBitMask = 1;
3446
    lastBitMask <<= 0x403E - aExp;
3447
    roundBitsMask = lastBitMask - 1;
3448
    z = a;
3449
    roundingMode = STATUS(float_rounding_mode);
3450
    if ( roundingMode == float_round_nearest_even ) {
3451
        z.low += lastBitMask>>1;
3452
        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3453
    }
3454
    else if ( roundingMode != float_round_to_zero ) {
3455
        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3456
            z.low += roundBitsMask;
3457
        }
3458
    }
3459
    z.low &= ~ roundBitsMask;
3460
    if ( z.low == 0 ) {
3461
        ++z.high;
3462
        z.low = LIT64( 0x8000000000000000 );
3463
    }
3464
    if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
3465
    return z;
3466

    
3467
}
3468

    
3469
/*----------------------------------------------------------------------------
3470
| Returns the result of adding the absolute values of the extended double-
3471
| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3472
| negated before being returned.  `zSign' is ignored if the result is a NaN.
3473
| The addition is performed according to the IEC/IEEE Standard for Binary
3474
| Floating-Point Arithmetic.
3475
*----------------------------------------------------------------------------*/
3476

    
3477
static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3478
{
3479
    int32 aExp, bExp, zExp;
3480
    bits64 aSig, bSig, zSig0, zSig1;
3481
    int32 expDiff;
3482

    
3483
    aSig = extractFloatx80Frac( a );
3484
    aExp = extractFloatx80Exp( a );
3485
    bSig = extractFloatx80Frac( b );
3486
    bExp = extractFloatx80Exp( b );
3487
    expDiff = aExp - bExp;
3488
    if ( 0 < expDiff ) {
3489
        if ( aExp == 0x7FFF ) {
3490
            if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3491
            return a;
3492
        }
3493
        if ( bExp == 0 ) --expDiff;
3494
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3495
        zExp = aExp;
3496
    }
3497
    else if ( expDiff < 0 ) {
3498
        if ( bExp == 0x7FFF ) {
3499
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3500
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3501
        }
3502
        if ( aExp == 0 ) ++expDiff;
3503
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3504
        zExp = bExp;
3505
    }
3506
    else {
3507
        if ( aExp == 0x7FFF ) {
3508
            if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3509
                return propagateFloatx80NaN( a, b STATUS_VAR );
3510
            }
3511
            return a;
3512
        }
3513
        zSig1 = 0;
3514
        zSig0 = aSig + bSig;
3515
        if ( aExp == 0 ) {
3516
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3517
            goto roundAndPack;
3518
        }
3519
        zExp = aExp;
3520
        goto shiftRight1;
3521
    }
3522
    zSig0 = aSig + bSig;
3523
    if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3524
 shiftRight1:
3525
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3526
    zSig0 |= LIT64( 0x8000000000000000 );
3527
    ++zExp;
3528
 roundAndPack:
3529
    return
3530
        roundAndPackFloatx80(
3531
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3532

    
3533
}
3534

    
3535
/*----------------------------------------------------------------------------
3536
| Returns the result of subtracting the absolute values of the extended
3537
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3538
| difference is negated before being returned.  `zSign' is ignored if the
3539
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
3540
| Standard for Binary Floating-Point Arithmetic.
3541
*----------------------------------------------------------------------------*/
3542

    
3543
static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
3544
{
3545
    int32 aExp, bExp, zExp;
3546
    bits64 aSig, bSig, zSig0, zSig1;
3547
    int32 expDiff;
3548
    floatx80 z;
3549

    
3550
    aSig = extractFloatx80Frac( a );
3551
    aExp = extractFloatx80Exp( a );
3552
    bSig = extractFloatx80Frac( b );
3553
    bExp = extractFloatx80Exp( b );
3554
    expDiff = aExp - bExp;
3555
    if ( 0 < expDiff ) goto aExpBigger;
3556
    if ( expDiff < 0 ) goto bExpBigger;
3557
    if ( aExp == 0x7FFF ) {
3558
        if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3559
            return propagateFloatx80NaN( a, b STATUS_VAR );
3560
        }
3561
        float_raise( float_flag_invalid STATUS_VAR);
3562
        z.low = floatx80_default_nan_low;
3563
        z.high = floatx80_default_nan_high;
3564
        return z;
3565
    }
3566
    if ( aExp == 0 ) {
3567
        aExp = 1;
3568
        bExp = 1;
3569
    }
3570
    zSig1 = 0;
3571
    if ( bSig < aSig ) goto aBigger;
3572
    if ( aSig < bSig ) goto bBigger;
3573
    return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3574
 bExpBigger:
3575
    if ( bExp == 0x7FFF ) {
3576
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3577
        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3578
    }
3579
    if ( aExp == 0 ) ++expDiff;
3580
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3581
 bBigger:
3582
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3583
    zExp = bExp;
3584
    zSign ^= 1;
3585
    goto normalizeRoundAndPack;
3586
 aExpBigger:
3587
    if ( aExp == 0x7FFF ) {
3588
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3589
        return a;
3590
    }
3591
    if ( bExp == 0 ) --expDiff;
3592
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3593
 aBigger:
3594
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3595
    zExp = aExp;
3596
 normalizeRoundAndPack:
3597
    return
3598
        normalizeRoundAndPackFloatx80(
3599
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3600

    
3601
}
3602

    
3603
/*----------------------------------------------------------------------------
3604
| Returns the result of adding the extended double-precision floating-point
3605
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
3606
| Standard for Binary Floating-Point Arithmetic.
3607
*----------------------------------------------------------------------------*/
3608

    
3609
floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
3610
{
3611
    flag aSign, bSign;
3612

    
3613
    aSign = extractFloatx80Sign( a );
3614
    bSign = extractFloatx80Sign( b );
3615
    if ( aSign == bSign ) {
3616
        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3617
    }
3618
    else {
3619
        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3620
    }
3621

    
3622
}
3623

    
3624
/*----------------------------------------------------------------------------
3625
| Returns the result of subtracting the extended double-precision floating-
3626
| point values `a' and `b'.  The operation is performed according to the
3627
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3628
*----------------------------------------------------------------------------*/
3629

    
3630
floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
3631
{
3632
    flag aSign, bSign;
3633

    
3634
    aSign = extractFloatx80Sign( a );
3635
    bSign = extractFloatx80Sign( b );
3636
    if ( aSign == bSign ) {
3637
        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3638
    }
3639
    else {
3640
        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3641
    }
3642

    
3643
}
3644

    
3645
/*----------------------------------------------------------------------------
3646
| Returns the result of multiplying the extended double-precision floating-
3647
| point values `a' and `b'.  The operation is performed according to the
3648
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3649
*----------------------------------------------------------------------------*/
3650

    
3651
floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
3652
{
3653
    flag aSign, bSign, zSign;
3654
    int32 aExp, bExp, zExp;
3655
    bits64 aSig, bSig, zSig0, zSig1;
3656
    floatx80 z;
3657

    
3658
    aSig = extractFloatx80Frac( a );
3659
    aExp = extractFloatx80Exp( a );
3660
    aSign = extractFloatx80Sign( a );
3661
    bSig = extractFloatx80Frac( b );
3662
    bExp = extractFloatx80Exp( b );
3663
    bSign = extractFloatx80Sign( b );
3664
    zSign = aSign ^ bSign;
3665
    if ( aExp == 0x7FFF ) {
3666
        if (    (bits64) ( aSig<<1 )
3667
             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3668
            return propagateFloatx80NaN( a, b STATUS_VAR );
3669
        }
3670
        if ( ( bExp | bSig ) == 0 ) goto invalid;
3671
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3672
    }
3673
    if ( bExp == 0x7FFF ) {
3674
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3675
        if ( ( aExp | aSig ) == 0 ) {
3676
 invalid:
3677
            float_raise( float_flag_invalid STATUS_VAR);
3678
            z.low = floatx80_default_nan_low;
3679
            z.high = floatx80_default_nan_high;
3680
            return z;
3681
        }
3682
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3683
    }
3684
    if ( aExp == 0 ) {
3685
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3686
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3687
    }
3688
    if ( bExp == 0 ) {
3689
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3690
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3691
    }
3692
    zExp = aExp + bExp - 0x3FFE;
3693
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
3694
    if ( 0 < (sbits64) zSig0 ) {
3695
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3696
        --zExp;
3697
    }
3698
    return
3699
        roundAndPackFloatx80(
3700
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3701

    
3702
}
3703

    
3704
/*----------------------------------------------------------------------------
3705
| Returns the result of dividing the extended double-precision floating-point
3706
| value `a' by the corresponding value `b'.  The operation is performed
3707
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3708
*----------------------------------------------------------------------------*/
3709

    
3710
floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
3711
{
3712
    flag aSign, bSign, zSign;
3713
    int32 aExp, bExp, zExp;
3714
    bits64 aSig, bSig, zSig0, zSig1;
3715
    bits64 rem0, rem1, rem2, term0, term1, term2;
3716
    floatx80 z;
3717

    
3718
    aSig = extractFloatx80Frac( a );
3719
    aExp = extractFloatx80Exp( a );
3720
    aSign = extractFloatx80Sign( a );
3721
    bSig = extractFloatx80Frac( b );
3722
    bExp = extractFloatx80Exp( b );
3723
    bSign = extractFloatx80Sign( b );
3724
    zSign = aSign ^ bSign;
3725
    if ( aExp == 0x7FFF ) {
3726
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3727
        if ( bExp == 0x7FFF ) {
3728
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3729
            goto invalid;
3730
        }
3731
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3732
    }
3733
    if ( bExp == 0x7FFF ) {
3734
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3735
        return packFloatx80( zSign, 0, 0 );
3736
    }
3737
    if ( bExp == 0 ) {
3738
        if ( bSig == 0 ) {
3739
            if ( ( aExp | aSig ) == 0 ) {
3740
 invalid:
3741
                float_raise( float_flag_invalid STATUS_VAR);
3742
                z.low = floatx80_default_nan_low;
3743
                z.high = floatx80_default_nan_high;
3744
                return z;
3745
            }
3746
            float_raise( float_flag_divbyzero STATUS_VAR);
3747
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3748
        }
3749
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3750
    }
3751
    if ( aExp == 0 ) {
3752
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3753
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3754
    }
3755
    zExp = aExp - bExp + 0x3FFE;
3756
    rem1 = 0;
3757
    if ( bSig <= aSig ) {
3758
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
3759
        ++zExp;
3760
    }
3761
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3762
    mul64To128( bSig, zSig0, &term0, &term1 );
3763
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3764
    while ( (sbits64) rem0 < 0 ) {
3765
        --zSig0;
3766
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3767
    }
3768
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
3769
    if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3770
        mul64To128( bSig, zSig1, &term1, &term2 );
3771
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3772
        while ( (sbits64) rem1 < 0 ) {
3773
            --zSig1;
3774
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3775
        }
3776
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
3777
    }
3778
    return
3779
        roundAndPackFloatx80(
3780
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3781

    
3782
}
3783

    
3784
/*----------------------------------------------------------------------------
3785
| Returns the remainder of the extended double-precision floating-point value
3786
| `a' with respect to the corresponding value `b'.  The operation is performed
3787
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3788
*----------------------------------------------------------------------------*/
3789

    
3790
floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
3791
{
3792
    flag aSign, bSign, zSign;
3793
    int32 aExp, bExp, expDiff;
3794
    bits64 aSig0, aSig1, bSig;
3795
    bits64 q, term0, term1, alternateASig0, alternateASig1;
3796
    floatx80 z;
3797

    
3798
    aSig0 = extractFloatx80Frac( a );
3799
    aExp = extractFloatx80Exp( a );
3800
    aSign = extractFloatx80Sign( a );
3801
    bSig = extractFloatx80Frac( b );
3802
    bExp = extractFloatx80Exp( b );
3803
    bSign = extractFloatx80Sign( b );
3804
    if ( aExp == 0x7FFF ) {
3805
        if (    (bits64) ( aSig0<<1 )
3806
             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3807
            return propagateFloatx80NaN( a, b STATUS_VAR );
3808
        }
3809
        goto invalid;
3810
    }
3811
    if ( bExp == 0x7FFF ) {
3812
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3813
        return a;
3814
    }
3815
    if ( bExp == 0 ) {
3816
        if ( bSig == 0 ) {
3817
 invalid:
3818
            float_raise( float_flag_invalid STATUS_VAR);
3819
            z.low = floatx80_default_nan_low;
3820
            z.high = floatx80_default_nan_high;
3821
            return z;
3822
        }
3823
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3824
    }
3825
    if ( aExp == 0 ) {
3826
        if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3827
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3828
    }
3829
    bSig |= LIT64( 0x8000000000000000 );
3830
    zSign = aSign;
3831
    expDiff = aExp - bExp;
3832
    aSig1 = 0;
3833
    if ( expDiff < 0 ) {
3834
        if ( expDiff < -1 ) return a;
3835
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3836
        expDiff = 0;
3837
    }
3838
    q = ( bSig <= aSig0 );
3839
    if ( q ) aSig0 -= bSig;
3840
    expDiff -= 64;
3841
    while ( 0 < expDiff ) {
3842
        q = estimateDiv128To64( aSig0, aSig1, bSig );
3843
        q = ( 2 < q ) ? q - 2 : 0;
3844
        mul64To128( bSig, q, &term0, &term1 );
3845
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3846
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3847
        expDiff -= 62;
3848
    }
3849
    expDiff += 64;
3850
    if ( 0 < expDiff ) {
3851
        q = estimateDiv128To64( aSig0, aSig1, bSig );
3852
        q = ( 2 < q ) ? q - 2 : 0;
3853
        q >>= 64 - expDiff;
3854
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3855
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3856
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3857
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
3858
            ++q;
3859
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3860
        }
3861
    }
3862
    else {
3863
        term1 = 0;
3864
        term0 = bSig;
3865
    }
3866
    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3867
    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3868
         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3869
              && ( q & 1 ) )
3870
       ) {
3871
        aSig0 = alternateASig0;
3872
        aSig1 = alternateASig1;
3873
        zSign = ! zSign;
3874
    }
3875
    return
3876
        normalizeRoundAndPackFloatx80(
3877
            80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
3878

    
3879
}
3880

    
3881
/*----------------------------------------------------------------------------
3882
| Returns the square root of the extended double-precision floating-point
3883
| value `a'.  The operation is performed according to the IEC/IEEE Standard
3884
| for Binary Floating-Point Arithmetic.
3885
*----------------------------------------------------------------------------*/
3886

    
3887
floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
3888
{
3889
    flag aSign;
3890
    int32 aExp, zExp;
3891
    bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3892
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3893
    floatx80 z;
3894

    
3895
    aSig0 = extractFloatx80Frac( a );
3896
    aExp = extractFloatx80Exp( a );
3897
    aSign = extractFloatx80Sign( a );
3898
    if ( aExp == 0x7FFF ) {
3899
        if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
3900
        if ( ! aSign ) return a;
3901
        goto invalid;
3902
    }
3903
    if ( aSign ) {
3904
        if ( ( aExp | aSig0 ) == 0 ) return a;
3905
 invalid:
3906
        float_raise( float_flag_invalid STATUS_VAR);
3907
        z.low = floatx80_default_nan_low;
3908
        z.high = floatx80_default_nan_high;
3909
        return z;
3910
    }
3911
    if ( aExp == 0 ) {
3912
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3913
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3914
    }
3915
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3916
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3917
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
3918
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
3919
    doubleZSig0 = zSig0<<1;
3920
    mul64To128( zSig0, zSig0, &term0, &term1 );
3921
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3922
    while ( (sbits64) rem0 < 0 ) {
3923
        --zSig0;
3924
        doubleZSig0 -= 2;
3925
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
3926
    }
3927
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
3928
    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3929
        if ( zSig1 == 0 ) zSig1 = 1;
3930
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
3931
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3932
        mul64To128( zSig1, zSig1, &term2, &term3 );
3933
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3934
        while ( (sbits64) rem1 < 0 ) {
3935
            --zSig1;
3936
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
3937
            term3 |= 1;
3938
            term2 |= doubleZSig0;
3939
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
3940
        }
3941
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3942
    }
3943
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
3944
    zSig0 |= doubleZSig0;
3945
    return
3946
        roundAndPackFloatx80(
3947
            STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
3948

    
3949
}
3950

    
3951
/*----------------------------------------------------------------------------
3952
| Returns 1 if the extended double-precision floating-point value `a' is
3953
| equal to the corresponding value `b', and 0 otherwise.  The comparison is
3954
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3955
| Arithmetic.
3956
*----------------------------------------------------------------------------*/
3957

    
3958
int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
3959
{
3960

    
3961
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3962
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3963
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3964
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3965
       ) {
3966
        if (    floatx80_is_signaling_nan( a )
3967
             || floatx80_is_signaling_nan( b ) ) {
3968
            float_raise( float_flag_invalid STATUS_VAR);
3969
        }
3970
        return 0;
3971
    }
3972
    return
3973
           ( a.low == b.low )
3974
        && (    ( a.high == b.high )
3975
             || (    ( a.low == 0 )
3976
                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3977
           );
3978

    
3979
}
3980

    
3981
/*----------------------------------------------------------------------------
3982
| Returns 1 if the extended double-precision floating-point value `a' is
3983
| less than or equal to the corresponding value `b', and 0 otherwise.  The
3984
| comparison is performed according to the IEC/IEEE Standard for Binary
3985
| Floating-Point Arithmetic.
3986
*----------------------------------------------------------------------------*/
3987

    
3988
int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
3989
{
3990
    flag aSign, bSign;
3991

    
3992
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3993
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3994
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3995
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3996
       ) {
3997
        float_raise( float_flag_invalid STATUS_VAR);
3998
        return 0;
3999
    }
4000
    aSign = extractFloatx80Sign( a );
4001
    bSign = extractFloatx80Sign( b );
4002
    if ( aSign != bSign ) {
4003
        return
4004
               aSign
4005
            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4006
                 == 0 );
4007
    }
4008
    return
4009
          aSign ? le128( b.high, b.low, a.high, a.low )
4010
        : le128( a.high, a.low, b.high, b.low );
4011

    
4012
}
4013

    
4014
/*----------------------------------------------------------------------------
4015
| Returns 1 if the extended double-precision floating-point value `a' is
4016
| less than the corresponding value `b', and 0 otherwise.  The comparison
4017
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4018
| Arithmetic.
4019
*----------------------------------------------------------------------------*/
4020

    
4021
int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4022
{
4023
    flag aSign, bSign;
4024

    
4025
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4026
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4027
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4028
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4029
       ) {
4030
        float_raise( float_flag_invalid STATUS_VAR);
4031
        return 0;
4032
    }
4033
    aSign = extractFloatx80Sign( a );
4034
    bSign = extractFloatx80Sign( b );
4035
    if ( aSign != bSign ) {
4036
        return
4037
               aSign
4038
            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4039
                 != 0 );
4040
    }
4041
    return
4042
          aSign ? lt128( b.high, b.low, a.high, a.low )
4043
        : lt128( a.high, a.low, b.high, b.low );
4044

    
4045
}
4046

    
4047
/*----------------------------------------------------------------------------
4048
| Returns 1 if the extended double-precision floating-point value `a' is equal
4049
| to the corresponding value `b', and 0 otherwise.  The invalid exception is
4050
| raised if either operand is a NaN.  Otherwise, the comparison is performed
4051
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4052
*----------------------------------------------------------------------------*/
4053

    
4054
int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4055
{
4056

    
4057
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4058
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4059
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4060
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4061
       ) {
4062
        float_raise( float_flag_invalid STATUS_VAR);
4063
        return 0;
4064
    }
4065
    return
4066
           ( a.low == b.low )
4067
        && (    ( a.high == b.high )
4068
             || (    ( a.low == 0 )
4069
                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4070
           );
4071

    
4072
}
4073

    
4074
/*----------------------------------------------------------------------------
4075
| Returns 1 if the extended double-precision floating-point value `a' is less
4076
| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4077
| do not cause an exception.  Otherwise, the comparison is performed according
4078
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4079
*----------------------------------------------------------------------------*/
4080

    
4081
int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4082
{
4083
    flag aSign, bSign;
4084

    
4085
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4086
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4087
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4088
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4089
       ) {
4090
        if (    floatx80_is_signaling_nan( a )
4091
             || floatx80_is_signaling_nan( b ) ) {
4092
            float_raise( float_flag_invalid STATUS_VAR);
4093
        }
4094
        return 0;
4095
    }
4096
    aSign = extractFloatx80Sign( a );
4097
    bSign = extractFloatx80Sign( b );
4098
    if ( aSign != bSign ) {
4099
        return
4100
               aSign
4101
            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4102
                 == 0 );
4103
    }
4104
    return
4105
          aSign ? le128( b.high, b.low, a.high, a.low )
4106
        : le128( a.high, a.low, b.high, b.low );
4107

    
4108
}
4109

    
4110
/*----------------------------------------------------------------------------
4111
| Returns 1 if the extended double-precision floating-point value `a' is less
4112
| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4113
| an exception.  Otherwise, the comparison is performed according to the
4114
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4115
*----------------------------------------------------------------------------*/
4116

    
4117
int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4118
{
4119
    flag aSign, bSign;
4120

    
4121
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4122
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4123
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4124
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4125
       ) {
4126
        if (    floatx80_is_signaling_nan( a )
4127
             || floatx80_is_signaling_nan( b ) ) {
4128
            float_raise( float_flag_invalid STATUS_VAR);
4129
        }
4130
        return 0;
4131
    }
4132
    aSign = extractFloatx80Sign( a );
4133
    bSign = extractFloatx80Sign( b );
4134
    if ( aSign != bSign ) {
4135
        return
4136
               aSign
4137
            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4138
                 != 0 );
4139
    }
4140
    return
4141
          aSign ? lt128( b.high, b.low, a.high, a.low )
4142
        : lt128( a.high, a.low, b.high, b.low );
4143

    
4144
}
4145

    
4146
#endif
4147

    
4148
#ifdef FLOAT128
4149

    
4150
/*----------------------------------------------------------------------------
4151
| Returns the result of converting the quadruple-precision floating-point
4152
| value `a' to the 32-bit two's complement integer format.  The conversion
4153
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4154
| Arithmetic---which means in particular that the conversion is rounded
4155
| according to the current rounding mode.  If `a' is a NaN, the largest
4156
| positive integer is returned.  Otherwise, if the conversion overflows, the
4157
| largest integer with the same sign as `a' is returned.
4158
*----------------------------------------------------------------------------*/
4159

    
4160
int32 float128_to_int32( float128 a STATUS_PARAM )
4161
{
4162
    flag aSign;
4163
    int32 aExp, shiftCount;
4164
    bits64 aSig0, aSig1;
4165

    
4166
    aSig1 = extractFloat128Frac1( a );
4167
    aSig0 = extractFloat128Frac0( a );
4168
    aExp = extractFloat128Exp( a );
4169
    aSign = extractFloat128Sign( a );
4170
    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4171
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4172
    aSig0 |= ( aSig1 != 0 );
4173
    shiftCount = 0x4028 - aExp;
4174
    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4175
    return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4176

    
4177
}
4178

    
4179
/*----------------------------------------------------------------------------
4180
| Returns the result of converting the quadruple-precision floating-point
4181
| value `a' to the 32-bit two's complement integer format.  The conversion
4182
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4183
| Arithmetic, except that the conversion is always rounded toward zero.  If
4184
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4185
| conversion overflows, the largest integer with the same sign as `a' is
4186
| returned.
4187
*----------------------------------------------------------------------------*/
4188

    
4189
int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4190
{
4191
    flag aSign;
4192
    int32 aExp, shiftCount;
4193
    bits64 aSig0, aSig1, savedASig;
4194
    int32 z;
4195

    
4196
    aSig1 = extractFloat128Frac1( a );
4197
    aSig0 = extractFloat128Frac0( a );
4198
    aExp = extractFloat128Exp( a );
4199
    aSign = extractFloat128Sign( a );
4200
    aSig0 |= ( aSig1 != 0 );
4201
    if ( 0x401E < aExp ) {
4202
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4203
        goto invalid;
4204
    }
4205
    else if ( aExp < 0x3FFF ) {
4206
        if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4207
        return 0;
4208
    }
4209
    aSig0 |= LIT64( 0x0001000000000000 );
4210
    shiftCount = 0x402F - aExp;
4211
    savedASig = aSig0;
4212
    aSig0 >>= shiftCount;
4213
    z = aSig0;
4214
    if ( aSign ) z = - z;
4215
    if ( ( z < 0 ) ^ aSign ) {
4216
 invalid:
4217
        float_raise( float_flag_invalid STATUS_VAR);
4218
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4219
    }
4220
    if ( ( aSig0<<shiftCount ) != savedASig ) {
4221
        STATUS(float_exception_flags) |= float_flag_inexact;
4222
    }
4223
    return z;
4224

    
4225
}
4226

    
4227
/*----------------------------------------------------------------------------
4228
| Returns the result of converting the quadruple-precision floating-point
4229
| value `a' to the 64-bit two's complement integer format.  The conversion
4230
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4231
| Arithmetic---which means in particular that the conversion is rounded
4232
| according to the current rounding mode.  If `a' is a NaN, the largest
4233
| positive integer is returned.  Otherwise, if the conversion overflows, the
4234
| largest integer with the same sign as `a' is returned.
4235
*----------------------------------------------------------------------------*/
4236

    
4237
int64 float128_to_int64( float128 a STATUS_PARAM )
4238
{
4239
    flag aSign;
4240
    int32 aExp, shiftCount;
4241
    bits64 aSig0, aSig1;
4242

    
4243
    aSig1 = extractFloat128Frac1( a );
4244
    aSig0 = extractFloat128Frac0( a );
4245
    aExp = extractFloat128Exp( a );
4246
    aSign = extractFloat128Sign( a );
4247
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4248
    shiftCount = 0x402F - aExp;
4249
    if ( shiftCount <= 0 ) {
4250
        if ( 0x403E < aExp ) {
4251
            float_raise( float_flag_invalid STATUS_VAR);
4252
            if (    ! aSign
4253
                 || (    ( aExp == 0x7FFF )
4254
                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4255
                    )
4256
               ) {
4257
                return LIT64( 0x7FFFFFFFFFFFFFFF );
4258
            }
4259
            return (sbits64) LIT64( 0x8000000000000000 );
4260
        }
4261
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4262
    }
4263
    else {
4264
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4265
    }
4266
    return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4267

    
4268
}
4269

    
4270
/*----------------------------------------------------------------------------
4271
| Returns the result of converting the quadruple-precision floating-point
4272
| value `a' to the 64-bit two's complement integer format.  The conversion
4273
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4274
| Arithmetic, except that the conversion is always rounded toward zero.
4275
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4276
| the conversion overflows, the largest integer with the same sign as `a' is
4277
| returned.
4278
*----------------------------------------------------------------------------*/
4279

    
4280
int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4281
{
4282
    flag aSign;
4283
    int32 aExp, shiftCount;
4284
    bits64 aSig0, aSig1;
4285
    int64 z;
4286

    
4287
    aSig1 = extractFloat128Frac1( a );
4288
    aSig0 = extractFloat128Frac0( a );
4289
    aExp = extractFloat128Exp( a );
4290
    aSign = extractFloat128Sign( a );
4291
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4292
    shiftCount = aExp - 0x402F;
4293
    if ( 0 < shiftCount ) {
4294
        if ( 0x403E <= aExp ) {
4295
            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4296
            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4297
                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4298
                if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4299
            }
4300
            else {
4301
                float_raise( float_flag_invalid STATUS_VAR);
4302
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4303
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
4304
                }
4305
            }
4306
            return (sbits64) LIT64( 0x8000000000000000 );
4307
        }
4308
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4309
        if ( (bits64) ( aSig1<<shiftCount ) ) {
4310
            STATUS(float_exception_flags) |= float_flag_inexact;
4311
        }
4312
    }
4313
    else {
4314
        if ( aExp < 0x3FFF ) {
4315
            if ( aExp | aSig0 | aSig1 ) {
4316
                STATUS(float_exception_flags) |= float_flag_inexact;
4317
            }
4318
            return 0;
4319
        }
4320
        z = aSig0>>( - shiftCount );
4321
        if (    aSig1
4322
             || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4323
            STATUS(float_exception_flags) |= float_flag_inexact;
4324
        }
4325
    }
4326
    if ( aSign ) z = - z;
4327
    return z;
4328

    
4329
}
4330

    
4331
/*----------------------------------------------------------------------------
4332
| Returns the result of converting the quadruple-precision floating-point
4333
| value `a' to the single-precision floating-point format.  The conversion
4334
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4335
| Arithmetic.
4336
*----------------------------------------------------------------------------*/
4337

    
4338
float32 float128_to_float32( float128 a STATUS_PARAM )
4339
{
4340
    flag aSign;
4341
    int32 aExp;
4342
    bits64 aSig0, aSig1;
4343
    bits32 zSig;
4344

    
4345
    aSig1 = extractFloat128Frac1( a );
4346
    aSig0 = extractFloat128Frac0( a );
4347
    aExp = extractFloat128Exp( a );
4348
    aSign = extractFloat128Sign( a );
4349
    if ( aExp == 0x7FFF ) {
4350
        if ( aSig0 | aSig1 ) {
4351
            return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4352
        }
4353
        return packFloat32( aSign, 0xFF, 0 );
4354
    }
4355
    aSig0 |= ( aSig1 != 0 );
4356
    shift64RightJamming( aSig0, 18, &aSig0 );
4357
    zSig = aSig0;
4358
    if ( aExp || zSig ) {
4359
        zSig |= 0x40000000;
4360
        aExp -= 0x3F81;
4361
    }
4362
    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4363

    
4364
}
4365

    
4366
/*----------------------------------------------------------------------------
4367
| Returns the result of converting the quadruple-precision floating-point
4368
| value `a' to the double-precision floating-point format.  The conversion
4369
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4370
| Arithmetic.
4371
*----------------------------------------------------------------------------*/
4372

    
4373
float64 float128_to_float64( float128 a STATUS_PARAM )
4374
{
4375
    flag aSign;
4376
    int32 aExp;
4377
    bits64 aSig0, aSig1;
4378

    
4379
    aSig1 = extractFloat128Frac1( a );
4380
    aSig0 = extractFloat128Frac0( a );
4381
    aExp = extractFloat128Exp( a );
4382
    aSign = extractFloat128Sign( a );
4383
    if ( aExp == 0x7FFF ) {
4384
        if ( aSig0 | aSig1 ) {
4385
            return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4386
        }
4387
        return packFloat64( aSign, 0x7FF, 0 );
4388
    }
4389
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4390
    aSig0 |= ( aSig1 != 0 );
4391
    if ( aExp || aSig0 ) {
4392
        aSig0 |= LIT64( 0x4000000000000000 );
4393
        aExp -= 0x3C01;
4394
    }
4395
    return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4396

    
4397
}
4398

    
4399
#ifdef FLOATX80
4400

    
4401
/*----------------------------------------------------------------------------
4402
| Returns the result of converting the quadruple-precision floating-point
4403
| value `a' to the extended double-precision floating-point format.  The
4404
| conversion is performed according to the IEC/IEEE Standard for Binary
4405
| Floating-Point Arithmetic.
4406
*----------------------------------------------------------------------------*/
4407

    
4408
floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4409
{
4410
    flag aSign;
4411
    int32 aExp;
4412
    bits64 aSig0, aSig1;
4413

    
4414
    aSig1 = extractFloat128Frac1( a );
4415
    aSig0 = extractFloat128Frac0( a );
4416
    aExp = extractFloat128Exp( a );
4417
    aSign = extractFloat128Sign( a );
4418
    if ( aExp == 0x7FFF ) {
4419
        if ( aSig0 | aSig1 ) {
4420
            return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4421
        }
4422
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4423
    }
4424
    if ( aExp == 0 ) {
4425
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4426
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4427
    }
4428
    else {
4429
        aSig0 |= LIT64( 0x0001000000000000 );
4430
    }
4431
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4432
    return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4433

    
4434
}
4435

    
4436
#endif
4437

    
4438
/*----------------------------------------------------------------------------
4439
| Rounds the quadruple-precision floating-point value `a' to an integer, and
4440
| returns the result as a quadruple-precision floating-point value.  The
4441
| operation is performed according to the IEC/IEEE Standard for Binary
4442
| Floating-Point Arithmetic.
4443
*----------------------------------------------------------------------------*/
4444

    
4445
float128 float128_round_to_int( float128 a STATUS_PARAM )
4446
{
4447
    flag aSign;
4448
    int32 aExp;
4449
    bits64 lastBitMask, roundBitsMask;
4450
    int8 roundingMode;
4451
    float128 z;
4452

    
4453
    aExp = extractFloat128Exp( a );
4454
    if ( 0x402F <= aExp ) {
4455
        if ( 0x406F <= aExp ) {
4456
            if (    ( aExp == 0x7FFF )
4457
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4458
               ) {
4459
                return propagateFloat128NaN( a, a STATUS_VAR );
4460
            }
4461
            return a;
4462
        }
4463
        lastBitMask = 1;
4464
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4465
        roundBitsMask = lastBitMask - 1;
4466
        z = a;
4467
        roundingMode = STATUS(float_rounding_mode);
4468
        if ( roundingMode == float_round_nearest_even ) {
4469
            if ( lastBitMask ) {
4470
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4471
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4472
            }
4473
            else {
4474
                if ( (sbits64) z.low < 0 ) {
4475
                    ++z.high;
4476
                    if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4477
                }
4478
            }
4479
        }
4480
        else if ( roundingMode != float_round_to_zero ) {
4481
            if (   extractFloat128Sign( z )
4482
                 ^ ( roundingMode == float_round_up ) ) {
4483
                add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4484
            }
4485
        }
4486
        z.low &= ~ roundBitsMask;
4487
    }
4488
    else {
4489
        if ( aExp < 0x3FFF ) {
4490
            if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4491
            STATUS(float_exception_flags) |= float_flag_inexact;
4492
            aSign = extractFloat128Sign( a );
4493
            switch ( STATUS(float_rounding_mode) ) {
4494
             case float_round_nearest_even:
4495
                if (    ( aExp == 0x3FFE )
4496
                     && (   extractFloat128Frac0( a )
4497
                          | extractFloat128Frac1( a ) )
4498
                   ) {
4499
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
4500
                }
4501
                break;
4502
             case float_round_down:
4503
                return
4504
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4505
                    : packFloat128( 0, 0, 0, 0 );
4506
             case float_round_up:
4507
                return
4508
                      aSign ? packFloat128( 1, 0, 0, 0 )
4509
                    : packFloat128( 0, 0x3FFF, 0, 0 );
4510
            }
4511
            return packFloat128( aSign, 0, 0, 0 );
4512
        }
4513
        lastBitMask = 1;
4514
        lastBitMask <<= 0x402F - aExp;
4515
        roundBitsMask = lastBitMask - 1;
4516
        z.low = 0;
4517
        z.high = a.high;
4518
        roundingMode = STATUS(float_rounding_mode);
4519
        if ( roundingMode == float_round_nearest_even ) {
4520
            z.high += lastBitMask>>1;
4521
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4522
                z.high &= ~ lastBitMask;
4523
            }
4524
        }
4525
        else if ( roundingMode != float_round_to_zero ) {
4526
            if (   extractFloat128Sign( z )
4527
                 ^ ( roundingMode == float_round_up ) ) {
4528
                z.high |= ( a.low != 0 );
4529
                z.high += roundBitsMask;
4530
            }
4531
        }
4532
        z.high &= ~ roundBitsMask;
4533
    }
4534
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4535
        STATUS(float_exception_flags) |= float_flag_inexact;
4536
    }
4537
    return z;
4538

    
4539
}
4540

    
4541
/*----------------------------------------------------------------------------
4542
| Returns the result of adding the absolute values of the quadruple-precision
4543
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4544
| before being returned.  `zSign' is ignored if the result is a NaN.
4545
| The addition is performed according to the IEC/IEEE Standard for Binary
4546
| Floating-Point Arithmetic.
4547
*----------------------------------------------------------------------------*/
4548

    
4549
static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4550
{
4551
    int32 aExp, bExp, zExp;
4552
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4553
    int32 expDiff;
4554

    
4555
    aSig1 = extractFloat128Frac1( a );
4556
    aSig0 = extractFloat128Frac0( a );
4557
    aExp = extractFloat128Exp( a );
4558
    bSig1 = extractFloat128Frac1( b );
4559
    bSig0 = extractFloat128Frac0( b );
4560
    bExp = extractFloat128Exp( b );
4561
    expDiff = aExp - bExp;
4562
    if ( 0 < expDiff ) {
4563
        if ( aExp == 0x7FFF ) {
4564
            if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4565
            return a;
4566
        }
4567
        if ( bExp == 0 ) {
4568
            --expDiff;
4569
        }
4570
        else {
4571
            bSig0 |= LIT64( 0x0001000000000000 );
4572
        }
4573
        shift128ExtraRightJamming(
4574
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4575
        zExp = aExp;
4576
    }
4577
    else if ( expDiff < 0 ) {
4578
        if ( bExp == 0x7FFF ) {
4579
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4580
            return packFloat128( zSign, 0x7FFF, 0, 0 );
4581
        }
4582
        if ( aExp == 0 ) {
4583
            ++expDiff;
4584
        }
4585
        else {
4586
            aSig0 |= LIT64( 0x0001000000000000 );
4587
        }
4588
        shift128ExtraRightJamming(
4589
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4590
        zExp = bExp;
4591
    }
4592
    else {
4593
        if ( aExp == 0x7FFF ) {
4594
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4595
                return propagateFloat128NaN( a, b STATUS_VAR );
4596
            }
4597
            return a;
4598
        }
4599
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4600
        if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4601
        zSig2 = 0;
4602
        zSig0 |= LIT64( 0x0002000000000000 );
4603
        zExp = aExp;
4604
        goto shiftRight1;
4605
    }
4606
    aSig0 |= LIT64( 0x0001000000000000 );
4607
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4608
    --zExp;
4609
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4610
    ++zExp;
4611
 shiftRight1:
4612
    shift128ExtraRightJamming(
4613
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4614
 roundAndPack:
4615
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4616

    
4617
}
4618

    
4619
/*----------------------------------------------------------------------------
4620
| Returns the result of subtracting the absolute values of the quadruple-
4621
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
4622
| difference is negated before being returned.  `zSign' is ignored if the
4623
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
4624
| Standard for Binary Floating-Point Arithmetic.
4625
*----------------------------------------------------------------------------*/
4626

    
4627
static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4628
{
4629
    int32 aExp, bExp, zExp;
4630
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4631
    int32 expDiff;
4632
    float128 z;
4633

    
4634
    aSig1 = extractFloat128Frac1( a );
4635
    aSig0 = extractFloat128Frac0( a );
4636
    aExp = extractFloat128Exp( a );
4637
    bSig1 = extractFloat128Frac1( b );
4638
    bSig0 = extractFloat128Frac0( b );
4639
    bExp = extractFloat128Exp( b );
4640
    expDiff = aExp - bExp;
4641
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4642
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4643
    if ( 0 < expDiff ) goto aExpBigger;
4644
    if ( expDiff < 0 ) goto bExpBigger;
4645
    if ( aExp == 0x7FFF ) {
4646
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4647
            return propagateFloat128NaN( a, b STATUS_VAR );
4648
        }
4649
        float_raise( float_flag_invalid STATUS_VAR);
4650
        z.low = float128_default_nan_low;
4651
        z.high = float128_default_nan_high;
4652
        return z;
4653
    }
4654
    if ( aExp == 0 ) {
4655
        aExp = 1;
4656
        bExp = 1;
4657
    }
4658
    if ( bSig0 < aSig0 ) goto aBigger;
4659
    if ( aSig0 < bSig0 ) goto bBigger;
4660
    if ( bSig1 < aSig1 ) goto aBigger;
4661
    if ( aSig1 < bSig1 ) goto bBigger;
4662
    return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
4663
 bExpBigger:
4664
    if ( bExp == 0x7FFF ) {
4665
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4666
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4667
    }
4668
    if ( aExp == 0 ) {
4669
        ++expDiff;
4670
    }
4671
    else {
4672
        aSig0 |= LIT64( 0x4000000000000000 );
4673
    }
4674
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4675
    bSig0 |= LIT64( 0x4000000000000000 );
4676
 bBigger:
4677
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4678
    zExp = bExp;
4679
    zSign ^= 1;
4680
    goto normalizeRoundAndPack;
4681
 aExpBigger:
4682
    if ( aExp == 0x7FFF ) {
4683
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4684
        return a;
4685
    }
4686
    if ( bExp == 0 ) {
4687
        --expDiff;
4688
    }
4689
    else {
4690
        bSig0 |= LIT64( 0x4000000000000000 );
4691
    }
4692
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4693
    aSig0 |= LIT64( 0x4000000000000000 );
4694
 aBigger:
4695
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4696
    zExp = aExp;
4697
 normalizeRoundAndPack:
4698
    --zExp;
4699
    return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
4700

    
4701
}
4702

    
4703
/*----------------------------------------------------------------------------
4704
| Returns the result of adding the quadruple-precision floating-point values
4705
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4706
| for Binary Floating-Point Arithmetic.
4707
*----------------------------------------------------------------------------*/
4708

    
4709
float128 float128_add( float128 a, float128 b STATUS_PARAM )
4710
{
4711
    flag aSign, bSign;
4712

    
4713
    aSign = extractFloat128Sign( a );
4714
    bSign = extractFloat128Sign( b );
4715
    if ( aSign == bSign ) {
4716
        return addFloat128Sigs( a, b, aSign STATUS_VAR );
4717
    }
4718
    else {
4719
        return subFloat128Sigs( a, b, aSign STATUS_VAR );
4720
    }
4721

    
4722
}
4723

    
4724
/*----------------------------------------------------------------------------
4725
| Returns the result of subtracting the quadruple-precision floating-point
4726
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4727
| Standard for Binary Floating-Point Arithmetic.
4728
*----------------------------------------------------------------------------*/
4729

    
4730
float128 float128_sub( float128 a, float128 b STATUS_PARAM )
4731
{
4732
    flag aSign, bSign;
4733

    
4734
    aSign = extractFloat128Sign( a );
4735
    bSign = extractFloat128Sign( b );
4736
    if ( aSign == bSign ) {
4737
        return subFloat128Sigs( a, b, aSign STATUS_VAR );
4738
    }
4739
    else {
4740
        return addFloat128Sigs( a, b, aSign STATUS_VAR );
4741
    }
4742

    
4743
}
4744

    
4745
/*----------------------------------------------------------------------------
4746
| Returns the result of multiplying the quadruple-precision floating-point
4747
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4748
| Standard for Binary Floating-Point Arithmetic.
4749
*----------------------------------------------------------------------------*/
4750

    
4751
float128 float128_mul( float128 a, float128 b STATUS_PARAM )
4752
{
4753
    flag aSign, bSign, zSign;
4754
    int32 aExp, bExp, zExp;
4755
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4756
    float128 z;
4757

    
4758
    aSig1 = extractFloat128Frac1( a );
4759
    aSig0 = extractFloat128Frac0( a );
4760
    aExp = extractFloat128Exp( a );
4761
    aSign = extractFloat128Sign( a );
4762
    bSig1 = extractFloat128Frac1( b );
4763
    bSig0 = extractFloat128Frac0( b );
4764
    bExp = extractFloat128Exp( b );
4765
    bSign = extractFloat128Sign( b );
4766
    zSign = aSign ^ bSign;
4767
    if ( aExp == 0x7FFF ) {
4768
        if (    ( aSig0 | aSig1 )
4769
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4770
            return propagateFloat128NaN( a, b STATUS_VAR );
4771
        }
4772
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4773
        return packFloat128( zSign, 0x7FFF, 0, 0 );
4774
    }
4775
    if ( bExp == 0x7FFF ) {
4776
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4777
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4778
 invalid:
4779
            float_raise( float_flag_invalid STATUS_VAR);
4780
            z.low = float128_default_nan_low;
4781
            z.high = float128_default_nan_high;
4782
            return z;
4783
        }
4784
        return packFloat128( zSign, 0x7FFF, 0, 0 );
4785
    }
4786
    if ( aExp == 0 ) {
4787
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4788
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4789
    }
4790
    if ( bExp == 0 ) {
4791
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4792
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4793
    }
4794
    zExp = aExp + bExp - 0x4000;
4795
    aSig0 |= LIT64( 0x0001000000000000 );
4796
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4797
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4798
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4799
    zSig2 |= ( zSig3 != 0 );
4800
    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4801
        shift128ExtraRightJamming(
4802
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4803
        ++zExp;
4804
    }
4805
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4806

    
4807
}
4808

    
4809
/*----------------------------------------------------------------------------
4810
| Returns the result of dividing the quadruple-precision floating-point value
4811
| `a' by the corresponding value `b'.  The operation is performed according to
4812
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4813
*----------------------------------------------------------------------------*/
4814

    
4815
float128 float128_div( float128 a, float128 b STATUS_PARAM )
4816
{
4817
    flag aSign, bSign, zSign;
4818
    int32 aExp, bExp, zExp;
4819
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4820
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4821
    float128 z;
4822

    
4823
    aSig1 = extractFloat128Frac1( a );
4824
    aSig0 = extractFloat128Frac0( a );
4825
    aExp = extractFloat128Exp( a );
4826
    aSign = extractFloat128Sign( a );
4827
    bSig1 = extractFloat128Frac1( b );
4828
    bSig0 = extractFloat128Frac0( b );
4829
    bExp = extractFloat128Exp( b );
4830
    bSign = extractFloat128Sign( b );
4831
    zSign = aSign ^ bSign;
4832
    if ( aExp == 0x7FFF ) {
4833
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4834
        if ( bExp == 0x7FFF ) {
4835
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4836
            goto invalid;
4837
        }
4838
        return packFloat128( zSign, 0x7FFF, 0, 0 );
4839
    }
4840
    if ( bExp == 0x7FFF ) {
4841
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4842
        return packFloat128( zSign, 0, 0, 0 );
4843
    }
4844
    if ( bExp == 0 ) {
4845
        if ( ( bSig0 | bSig1 ) == 0 ) {
4846
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4847
 invalid:
4848
                float_raise( float_flag_invalid STATUS_VAR);
4849
                z.low = float128_default_nan_low;
4850
                z.high = float128_default_nan_high;
4851
                return z;
4852
            }
4853
            float_raise( float_flag_divbyzero STATUS_VAR);
4854
            return packFloat128( zSign, 0x7FFF, 0, 0 );
4855
        }
4856
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4857
    }
4858
    if ( aExp == 0 ) {
4859
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4860
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4861
    }
4862
    zExp = aExp - bExp + 0x3FFD;
4863
    shortShift128Left(
4864
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4865
    shortShift128Left(
4866
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4867
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4868
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4869
        ++zExp;
4870
    }
4871
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4872
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4873
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4874
    while ( (sbits64) rem0 < 0 ) {
4875
        --zSig0;
4876
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4877
    }
4878
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4879
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4880
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4881
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4882
        while ( (sbits64) rem1 < 0 ) {
4883
            --zSig1;
4884
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4885
        }
4886
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4887
    }
4888
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4889
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4890

    
4891
}
4892

    
4893
/*----------------------------------------------------------------------------
4894
| Returns the remainder of the quadruple-precision floating-point value `a'
4895
| with respect to the corresponding value `b'.  The operation is performed
4896
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4897
*----------------------------------------------------------------------------*/
4898

    
4899
float128 float128_rem( float128 a, float128 b STATUS_PARAM )
4900
{
4901
    flag aSign, bSign, zSign;
4902
    int32 aExp, bExp, expDiff;
4903
    bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
4904
    bits64 allZero, alternateASig0, alternateASig1, sigMean1;
4905
    sbits64 sigMean0;
4906
    float128 z;
4907

    
4908
    aSig1 = extractFloat128Frac1( a );
4909
    aSig0 = extractFloat128Frac0( a );
4910
    aExp = extractFloat128Exp( a );
4911
    aSign = extractFloat128Sign( a );
4912
    bSig1 = extractFloat128Frac1( b );
4913
    bSig0 = extractFloat128Frac0( b );
4914
    bExp = extractFloat128Exp( b );
4915
    bSign = extractFloat128Sign( b );
4916
    if ( aExp == 0x7FFF ) {
4917
        if (    ( aSig0 | aSig1 )
4918
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4919
            return propagateFloat128NaN( a, b STATUS_VAR );
4920
        }
4921
        goto invalid;
4922
    }
4923
    if ( bExp == 0x7FFF ) {
4924
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4925
        return a;
4926
    }
4927
    if ( bExp == 0 ) {
4928
        if ( ( bSig0 | bSig1 ) == 0 ) {
4929
 invalid:
4930
            float_raise( float_flag_invalid STATUS_VAR);
4931
            z.low = float128_default_nan_low;
4932
            z.high = float128_default_nan_high;
4933
            return z;
4934
        }
4935
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4936
    }
4937
    if ( aExp == 0 ) {
4938
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
4939
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4940
    }
4941
    expDiff = aExp - bExp;
4942
    if ( expDiff < -1 ) return a;
4943
    shortShift128Left(
4944
        aSig0 | LIT64( 0x0001000000000000 ),
4945
        aSig1,
4946
        15 - ( expDiff < 0 ),
4947
        &aSig0,
4948
        &aSig1
4949
    );
4950
    shortShift128Left(
4951
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4952
    q = le128( bSig0, bSig1, aSig0, aSig1 );
4953
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4954
    expDiff -= 64;
4955
    while ( 0 < expDiff ) {
4956
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4957
        q = ( 4 < q ) ? q - 4 : 0;
4958
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4959
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
4960
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
4961
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
4962
        expDiff -= 61;
4963
    }
4964
    if ( -64 < expDiff ) {
4965
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4966
        q = ( 4 < q ) ? q - 4 : 0;
4967
        q >>= - expDiff;
4968
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4969
        expDiff += 52;
4970
        if ( expDiff < 0 ) {
4971
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4972
        }
4973
        else {
4974
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
4975
        }
4976
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4977
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
4978
    }
4979
    else {
4980
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
4981
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4982
    }
4983
    do {
4984
        alternateASig0 = aSig0;
4985
        alternateASig1 = aSig1;
4986
        ++q;
4987
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4988
    } while ( 0 <= (sbits64) aSig0 );
4989
    add128(
4990
        aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
4991
    if (    ( sigMean0 < 0 )
4992
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
4993
        aSig0 = alternateASig0;
4994
        aSig1 = alternateASig1;
4995
    }
4996
    zSign = ( (sbits64) aSig0 < 0 );
4997
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
4998
    return
4999
        normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5000

    
5001
}
5002

    
5003
/*----------------------------------------------------------------------------
5004
| Returns the square root of the quadruple-precision floating-point value `a'.
5005
| The operation is performed according to the IEC/IEEE Standard for Binary
5006
| Floating-Point Arithmetic.
5007
*----------------------------------------------------------------------------*/
5008

    
5009
float128 float128_sqrt( float128 a STATUS_PARAM )
5010
{
5011
    flag aSign;
5012
    int32 aExp, zExp;
5013
    bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5014
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5015
    float128 z;
5016

    
5017
    aSig1 = extractFloat128Frac1( a );
5018
    aSig0 = extractFloat128Frac0( a );
5019
    aExp = extractFloat128Exp( a );
5020
    aSign = extractFloat128Sign( a );
5021
    if ( aExp == 0x7FFF ) {
5022
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5023
        if ( ! aSign ) return a;
5024
        goto invalid;
5025
    }
5026
    if ( aSign ) {
5027
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5028
 invalid:
5029
        float_raise( float_flag_invalid STATUS_VAR);
5030
        z.low = float128_default_nan_low;
5031
        z.high = float128_default_nan_high;
5032
        return z;
5033
    }
5034
    if ( aExp == 0 ) {
5035
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5036
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5037
    }
5038
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5039
    aSig0 |= LIT64( 0x0001000000000000 );
5040
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5041
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5042
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5043
    doubleZSig0 = zSig0<<1;
5044
    mul64To128( zSig0, zSig0, &term0, &term1 );
5045
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5046
    while ( (sbits64) rem0 < 0 ) {
5047
        --zSig0;
5048
        doubleZSig0 -= 2;
5049
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5050
    }
5051
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5052
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5053
        if ( zSig1 == 0 ) zSig1 = 1;
5054
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5055
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5056
        mul64To128( zSig1, zSig1, &term2, &term3 );
5057
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5058
        while ( (sbits64) rem1 < 0 ) {
5059
            --zSig1;
5060
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5061
            term3 |= 1;
5062
            term2 |= doubleZSig0;
5063
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5064
        }
5065
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5066
    }
5067
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5068
    return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5069

    
5070
}
5071

    
5072
/*----------------------------------------------------------------------------
5073
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5074
| the corresponding value `b', and 0 otherwise.  The comparison is performed
5075
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5076
*----------------------------------------------------------------------------*/
5077

    
5078
int float128_eq( float128 a, float128 b STATUS_PARAM )
5079
{
5080

    
5081
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5082
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5083
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5084
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5085
       ) {
5086
        if (    float128_is_signaling_nan( a )
5087
             || float128_is_signaling_nan( b ) ) {
5088
            float_raise( float_flag_invalid STATUS_VAR);
5089
        }
5090
        return 0;
5091
    }
5092
    return
5093
           ( a.low == b.low )
5094
        && (    ( a.high == b.high )
5095
             || (    ( a.low == 0 )
5096
                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5097
           );
5098

    
5099
}
5100

    
5101
/*----------------------------------------------------------------------------
5102
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5103
| or equal to the corresponding value `b', and 0 otherwise.  The comparison
5104
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5105
| Arithmetic.
5106
*----------------------------------------------------------------------------*/
5107

    
5108
int float128_le( float128 a, float128 b STATUS_PARAM )
5109
{
5110
    flag aSign, bSign;
5111

    
5112
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5113
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5114
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5115
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5116
       ) {
5117
        float_raise( float_flag_invalid STATUS_VAR);
5118
        return 0;
5119
    }
5120
    aSign = extractFloat128Sign( a );
5121
    bSign = extractFloat128Sign( b );
5122
    if ( aSign != bSign ) {
5123
        return
5124
               aSign
5125
            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5126
                 == 0 );
5127
    }
5128
    return
5129
          aSign ? le128( b.high, b.low, a.high, a.low )
5130
        : le128( a.high, a.low, b.high, b.low );
5131

    
5132
}
5133

    
5134
/*----------------------------------------------------------------------------
5135
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5136
| the corresponding value `b', and 0 otherwise.  The comparison is performed
5137
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5138
*----------------------------------------------------------------------------*/
5139

    
5140
int float128_lt( float128 a, float128 b STATUS_PARAM )
5141
{
5142
    flag aSign, bSign;
5143

    
5144
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5145
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5146
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5147
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5148
       ) {
5149
        float_raise( float_flag_invalid STATUS_VAR);
5150
        return 0;
5151
    }
5152
    aSign = extractFloat128Sign( a );
5153
    bSign = extractFloat128Sign( b );
5154
    if ( aSign != bSign ) {
5155
        return
5156
               aSign
5157
            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5158
                 != 0 );
5159
    }
5160
    return
5161
          aSign ? lt128( b.high, b.low, a.high, a.low )
5162
        : lt128( a.high, a.low, b.high, b.low );
5163

    
5164
}
5165

    
5166
/*----------------------------------------------------------------------------
5167
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5168
| the corresponding value `b', and 0 otherwise.  The invalid exception is
5169
| raised if either operand is a NaN.  Otherwise, the comparison is performed
5170
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5171
*----------------------------------------------------------------------------*/
5172

    
5173
int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5174
{
5175

    
5176
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5177
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5178
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5179
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5180
       ) {
5181
        float_raise( float_flag_invalid STATUS_VAR);
5182
        return 0;
5183
    }
5184
    return
5185
           ( a.low == b.low )
5186
        && (    ( a.high == b.high )
5187
             || (    ( a.low == 0 )
5188
                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5189
           );
5190

    
5191
}
5192

    
5193
/*----------------------------------------------------------------------------
5194
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5195
| or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5196
| cause an exception.  Otherwise, the comparison is performed according to the
5197
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5198
*----------------------------------------------------------------------------*/
5199

    
5200
int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5201
{
5202
    flag aSign, bSign;
5203

    
5204
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5205
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5206
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5207
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5208
       ) {
5209
        if (    float128_is_signaling_nan( a )
5210
             || float128_is_signaling_nan( b ) ) {
5211
            float_raise( float_flag_invalid STATUS_VAR);
5212
        }
5213
        return 0;
5214
    }
5215
    aSign = extractFloat128Sign( a );
5216
    bSign = extractFloat128Sign( b );
5217
    if ( aSign != bSign ) {
5218
        return
5219
               aSign
5220
            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5221
                 == 0 );
5222
    }
5223
    return
5224
          aSign ? le128( b.high, b.low, a.high, a.low )
5225
        : le128( a.high, a.low, b.high, b.low );
5226

    
5227
}
5228

    
5229
/*----------------------------------------------------------------------------
5230
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5231
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5232
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5233
| Standard for Binary Floating-Point Arithmetic.
5234
*----------------------------------------------------------------------------*/
5235

    
5236
int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5237
{
5238
    flag aSign, bSign;
5239

    
5240
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5241
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5242
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5243
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5244
       ) {
5245
        if (    float128_is_signaling_nan( a )
5246
             || float128_is_signaling_nan( b ) ) {
5247
            float_raise( float_flag_invalid STATUS_VAR);
5248
        }
5249
        return 0;
5250
    }
5251
    aSign = extractFloat128Sign( a );
5252
    bSign = extractFloat128Sign( b );
5253
    if ( aSign != bSign ) {
5254
        return
5255
               aSign
5256
            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5257
                 != 0 );
5258
    }
5259
    return
5260
          aSign ? lt128( b.high, b.low, a.high, a.low )
5261
        : lt128( a.high, a.low, b.high, b.low );
5262

    
5263
}
5264

    
5265
#endif
5266

    
5267
/* misc functions */
5268
float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5269
{
5270
    return int64_to_float32(a STATUS_VAR);
5271
}
5272

    
5273
float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5274
{
5275
    return int64_to_float64(a STATUS_VAR);
5276
}
5277

    
5278
unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5279
{
5280
    int64_t v;
5281
    unsigned int res;
5282

    
5283
    v = float32_to_int64(a STATUS_VAR);
5284
    if (v < 0) {
5285
        res = 0;
5286
        float_raise( float_flag_invalid STATUS_VAR);
5287
    } else if (v > 0xffffffff) {
5288
        res = 0xffffffff;
5289
        float_raise( float_flag_invalid STATUS_VAR);
5290
    } else {
5291
        res = v;
5292
    }
5293
    return res;
5294
}
5295

    
5296
unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5297
{
5298
    int64_t v;
5299
    unsigned int res;
5300

    
5301
    v = float32_to_int64_round_to_zero(a STATUS_VAR);
5302
    if (v < 0) {
5303
        res = 0;
5304
        float_raise( float_flag_invalid STATUS_VAR);
5305
    } else if (v > 0xffffffff) {
5306
        res = 0xffffffff;
5307
        float_raise( float_flag_invalid STATUS_VAR);
5308
    } else {
5309
        res = v;
5310
    }
5311
    return res;
5312
}
5313

    
5314
unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5315
{
5316
    int64_t v;
5317
    unsigned int res;
5318

    
5319
    v = float64_to_int64(a STATUS_VAR);
5320
    if (v < 0) {
5321
        res = 0;
5322
        float_raise( float_flag_invalid STATUS_VAR);
5323
    } else if (v > 0xffffffff) {
5324
        res = 0xffffffff;
5325
        float_raise( float_flag_invalid STATUS_VAR);
5326
    } else {
5327
        res = v;
5328
    }
5329
    return res;
5330
}
5331

    
5332
unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5333
{
5334
    int64_t v;
5335
    unsigned int res;
5336

    
5337
    v = float64_to_int64_round_to_zero(a STATUS_VAR);
5338
    if (v < 0) {
5339
        res = 0;
5340
        float_raise( float_flag_invalid STATUS_VAR);
5341
    } else if (v > 0xffffffff) {
5342
        res = 0xffffffff;
5343
        float_raise( float_flag_invalid STATUS_VAR);
5344
    } else {
5345
        res = v;
5346
    }
5347
    return res;
5348
}
5349

    
5350
/* FIXME: This looks broken.  */
5351
uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5352
{
5353
    int64_t v;
5354

    
5355
    v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5356
    v += float64_val(a);
5357
    v = float64_to_int64(make_float64(v) STATUS_VAR);
5358

    
5359
    return v - INT64_MIN;
5360
}
5361

    
5362
uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5363
{
5364
    int64_t v;
5365

    
5366
    v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5367
    v += float64_val(a);
5368
    v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5369

    
5370
    return v - INT64_MIN;
5371
}
5372

    
5373
#define COMPARE(s, nan_exp)                                                  \
5374
INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
5375
                                      int is_quiet STATUS_PARAM )            \
5376
{                                                                            \
5377
    flag aSign, bSign;                                                       \
5378
    bits ## s av, bv;                                                        \
5379
                                                                             \
5380
    if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
5381
         extractFloat ## s ## Frac( a ) ) ||                                 \
5382
        ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
5383
          extractFloat ## s ## Frac( b ) )) {                                \
5384
        if (!is_quiet ||                                                     \
5385
            float ## s ## _is_signaling_nan( a ) ||                          \
5386
            float ## s ## _is_signaling_nan( b ) ) {                         \
5387
            float_raise( float_flag_invalid STATUS_VAR);                     \
5388
        }                                                                    \
5389
        return float_relation_unordered;                                     \
5390
    }                                                                        \
5391
    aSign = extractFloat ## s ## Sign( a );                                  \
5392
    bSign = extractFloat ## s ## Sign( b );                                  \
5393
    av = float ## s ## _val(a);                                              \
5394
    bv = float ## s ## _val(b);                                              \
5395
    if ( aSign != bSign ) {                                                  \
5396
        if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) {                         \
5397
            /* zero case */                                                  \
5398
            return float_relation_equal;                                     \
5399
        } else {                                                             \
5400
            return 1 - (2 * aSign);                                          \
5401
        }                                                                    \
5402
    } else {                                                                 \
5403
        if (av == bv) {                                                      \
5404
            return float_relation_equal;                                     \
5405
        } else {                                                             \
5406
            return 1 - 2 * (aSign ^ ( av < bv ));                            \
5407
        }                                                                    \
5408
    }                                                                        \
5409
}                                                                            \
5410
                                                                             \
5411
int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
5412
{                                                                            \
5413
    return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
5414
}                                                                            \
5415
                                                                             \
5416
int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
5417
{                                                                            \
5418
    return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
5419
}
5420

    
5421
COMPARE(32, 0xff)
5422
COMPARE(64, 0x7ff)
5423

    
5424
INLINE int float128_compare_internal( float128 a, float128 b,
5425
                                      int is_quiet STATUS_PARAM )
5426
{
5427
    flag aSign, bSign;
5428

    
5429
    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
5430
          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
5431
        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
5432
          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
5433
        if (!is_quiet ||
5434
            float128_is_signaling_nan( a ) ||
5435
            float128_is_signaling_nan( b ) ) {
5436
            float_raise( float_flag_invalid STATUS_VAR);
5437
        }
5438
        return float_relation_unordered;
5439
    }
5440
    aSign = extractFloat128Sign( a );
5441
    bSign = extractFloat128Sign( b );
5442
    if ( aSign != bSign ) {
5443
        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
5444
            /* zero case */
5445
            return float_relation_equal;
5446
        } else {
5447
            return 1 - (2 * aSign);
5448
        }
5449
    } else {
5450
        if (a.low == b.low && a.high == b.high) {
5451
            return float_relation_equal;
5452
        } else {
5453
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
5454
        }
5455
    }
5456
}
5457

    
5458
int float128_compare( float128 a, float128 b STATUS_PARAM )
5459
{
5460
    return float128_compare_internal(a, b, 0 STATUS_VAR);
5461
}
5462

    
5463
int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
5464
{
5465
    return float128_compare_internal(a, b, 1 STATUS_VAR);
5466
}
5467

    
5468
/* Multiply A by 2 raised to the power N.  */
5469
float32 float32_scalbn( float32 a, int n STATUS_PARAM )
5470
{
5471
    flag aSign;
5472
    int16 aExp;
5473
    bits32 aSig;
5474

    
5475
    aSig = extractFloat32Frac( a );
5476
    aExp = extractFloat32Exp( a );
5477
    aSign = extractFloat32Sign( a );
5478

    
5479
    if ( aExp == 0xFF ) {
5480
        return a;
5481
    }
5482
    aExp += n;
5483
    return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
5484
}
5485

    
5486
float64 float64_scalbn( float64 a, int n STATUS_PARAM )
5487
{
5488
    flag aSign;
5489
    int16 aExp;
5490
    bits64 aSig;
5491

    
5492
    aSig = extractFloat64Frac( a );
5493
    aExp = extractFloat64Exp( a );
5494
    aSign = extractFloat64Sign( a );
5495

    
5496
    if ( aExp == 0x7FF ) {
5497
        return a;
5498
    }
5499
    aExp += n;
5500
    return roundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
5501
}
5502

    
5503
#ifdef FLOATX80
5504
floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
5505
{
5506
    flag aSign;
5507
    int16 aExp;
5508
    bits64 aSig;
5509

    
5510
    aSig = extractFloatx80Frac( a );
5511
    aExp = extractFloatx80Exp( a );
5512
    aSign = extractFloatx80Sign( a );
5513

    
5514
    if ( aExp == 0x7FF ) {
5515
        return a;
5516
    }
5517
    aExp += n;
5518
    return roundAndPackFloatx80( STATUS(floatx80_rounding_precision),
5519
                                 aSign, aExp, aSig, 0 STATUS_VAR );
5520
}
5521
#endif
5522

    
5523
#ifdef FLOAT128
5524
float128 float128_scalbn( float128 a, int n STATUS_PARAM )
5525
{
5526
    flag aSign;
5527
    int32 aExp;
5528
    bits64 aSig0, aSig1;
5529

    
5530
    aSig1 = extractFloat128Frac1( a );
5531
    aSig0 = extractFloat128Frac0( a );
5532
    aExp = extractFloat128Exp( a );
5533
    aSign = extractFloat128Sign( a );
5534
    if ( aExp == 0x7FFF ) {
5535
        return a;
5536
    }
5537
    aExp += n;
5538
    return roundAndPackFloat128( aSign, aExp, aSig0, aSig1, 0 STATUS_VAR );
5539

    
5540
}
5541
#endif