Statistics
| Branch: | Revision:

root / fpu / softfloat.c @ fd728f2f

History | View | Annotate | Download (253.8 kB)

1
/*
2
 * QEMU float support
3
 *
4
 * Derived from SoftFloat.
5
 */
6

    
7
/*============================================================================
8

9
This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
10
Package, Release 2b.
11

12
Written by John R. Hauser.  This work was made possible in part by the
13
International Computer Science Institute, located at Suite 600, 1947 Center
14
Street, Berkeley, California 94704.  Funding was partially provided by the
15
National Science Foundation under grant MIP-9311980.  The original version
16
of this code was written as part of a project to build a fixed-point vector
17
processor in collaboration with the University of California at Berkeley,
18
overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
19
is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20
arithmetic/SoftFloat.html'.
21

22
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
23
been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24
RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25
AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26
COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27
EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28
INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29
OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
30

31
Derivative works are acceptable, even for commercial purposes, so long as
32
(1) the source code for the derivative work includes prominent notice that
33
the work is derivative, and (2) the source code includes prominent notice with
34
these four paragraphs for those parts of this code that are retained.
35

36
=============================================================================*/
37

    
38
/* softfloat (and in particular the code in softfloat-specialize.h) is
39
 * target-dependent and needs the TARGET_* macros.
40
 */
41
#include "config.h"
42

    
43
#include "fpu/softfloat.h"
44

    
45
/*----------------------------------------------------------------------------
46
| Primitive arithmetic functions, including multi-word arithmetic, and
47
| division and square root approximations.  (Can be specialized to target if
48
| desired.)
49
*----------------------------------------------------------------------------*/
50
#include "softfloat-macros.h"
51

    
52
/*----------------------------------------------------------------------------
53
| Functions and definitions to determine:  (1) whether tininess for underflow
54
| is detected before or after rounding by default, (2) what (if anything)
55
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
56
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
57
| are propagated from function inputs to output.  These details are target-
58
| specific.
59
*----------------------------------------------------------------------------*/
60
#include "softfloat-specialize.h"
61

    
62
void set_float_rounding_mode(int val STATUS_PARAM)
63
{
64
    STATUS(float_rounding_mode) = val;
65
}
66

    
67
void set_float_exception_flags(int val STATUS_PARAM)
68
{
69
    STATUS(float_exception_flags) = val;
70
}
71

    
72
void set_floatx80_rounding_precision(int val STATUS_PARAM)
73
{
74
    STATUS(floatx80_rounding_precision) = val;
75
}
76

    
77
/*----------------------------------------------------------------------------
78
| Returns the fraction bits of the half-precision floating-point value `a'.
79
*----------------------------------------------------------------------------*/
80

    
81
INLINE uint32_t extractFloat16Frac(float16 a)
82
{
83
    return float16_val(a) & 0x3ff;
84
}
85

    
86
/*----------------------------------------------------------------------------
87
| Returns the exponent bits of the half-precision floating-point value `a'.
88
*----------------------------------------------------------------------------*/
89

    
90
INLINE int_fast16_t extractFloat16Exp(float16 a)
91
{
92
    return (float16_val(a) >> 10) & 0x1f;
93
}
94

    
95
/*----------------------------------------------------------------------------
96
| Returns the sign bit of the single-precision floating-point value `a'.
97
*----------------------------------------------------------------------------*/
98

    
99
INLINE flag extractFloat16Sign(float16 a)
100
{
101
    return float16_val(a)>>15;
102
}
103

    
104
/*----------------------------------------------------------------------------
105
| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
106
| and 7, and returns the properly rounded 32-bit integer corresponding to the
107
| input.  If `zSign' is 1, the input is negated before being converted to an
108
| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
109
| is simply rounded to an integer, with the inexact exception raised if the
110
| input cannot be represented exactly as an integer.  However, if the fixed-
111
| point input is too large, the invalid exception is raised and the largest
112
| positive or negative integer is returned.
113
*----------------------------------------------------------------------------*/
114

    
115
static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
116
{
117
    int8 roundingMode;
118
    flag roundNearestEven;
119
    int8 roundIncrement, roundBits;
120
    int32_t z;
121

    
122
    roundingMode = STATUS(float_rounding_mode);
123
    roundNearestEven = ( roundingMode == float_round_nearest_even );
124
    roundIncrement = 0x40;
125
    if ( ! roundNearestEven ) {
126
        if ( roundingMode == float_round_to_zero ) {
127
            roundIncrement = 0;
128
        }
129
        else {
130
            roundIncrement = 0x7F;
131
            if ( zSign ) {
132
                if ( roundingMode == float_round_up ) roundIncrement = 0;
133
            }
134
            else {
135
                if ( roundingMode == float_round_down ) roundIncrement = 0;
136
            }
137
        }
138
    }
139
    roundBits = absZ & 0x7F;
140
    absZ = ( absZ + roundIncrement )>>7;
141
    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
142
    z = absZ;
143
    if ( zSign ) z = - z;
144
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
145
        float_raise( float_flag_invalid STATUS_VAR);
146
        return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
147
    }
148
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
149
    return z;
150

    
151
}
152

    
153
/*----------------------------------------------------------------------------
154
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
155
| `absZ1', with binary point between bits 63 and 64 (between the input words),
156
| and returns the properly rounded 64-bit integer corresponding to the input.
157
| If `zSign' is 1, the input is negated before being converted to an integer.
158
| Ordinarily, the fixed-point input is simply rounded to an integer, with
159
| the inexact exception raised if the input cannot be represented exactly as
160
| an integer.  However, if the fixed-point input is too large, the invalid
161
| exception is raised and the largest positive or negative integer is
162
| returned.
163
*----------------------------------------------------------------------------*/
164

    
165
static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
166
{
167
    int8 roundingMode;
168
    flag roundNearestEven, increment;
169
    int64_t z;
170

    
171
    roundingMode = STATUS(float_rounding_mode);
172
    roundNearestEven = ( roundingMode == float_round_nearest_even );
173
    increment = ( (int64_t) absZ1 < 0 );
174
    if ( ! roundNearestEven ) {
175
        if ( roundingMode == float_round_to_zero ) {
176
            increment = 0;
177
        }
178
        else {
179
            if ( zSign ) {
180
                increment = ( roundingMode == float_round_down ) && absZ1;
181
            }
182
            else {
183
                increment = ( roundingMode == float_round_up ) && absZ1;
184
            }
185
        }
186
    }
187
    if ( increment ) {
188
        ++absZ0;
189
        if ( absZ0 == 0 ) goto overflow;
190
        absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
191
    }
192
    z = absZ0;
193
    if ( zSign ) z = - z;
194
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
195
 overflow:
196
        float_raise( float_flag_invalid STATUS_VAR);
197
        return
198
              zSign ? (int64_t) LIT64( 0x8000000000000000 )
199
            : LIT64( 0x7FFFFFFFFFFFFFFF );
200
    }
201
    if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
202
    return z;
203

    
204
}
205

    
206
/*----------------------------------------------------------------------------
207
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
208
| `absZ1', with binary point between bits 63 and 64 (between the input words),
209
| and returns the properly rounded 64-bit unsigned integer corresponding to the
210
| input.  Ordinarily, the fixed-point input is simply rounded to an integer,
211
| with the inexact exception raised if the input cannot be represented exactly
212
| as an integer.  However, if the fixed-point input is too large, the invalid
213
| exception is raised and the largest unsigned integer is returned.
214
*----------------------------------------------------------------------------*/
215

    
216
static int64 roundAndPackUint64(flag zSign, uint64_t absZ0,
217
                                uint64_t absZ1 STATUS_PARAM)
218
{
219
    int8 roundingMode;
220
    flag roundNearestEven, increment;
221

    
222
    roundingMode = STATUS(float_rounding_mode);
223
    roundNearestEven = (roundingMode == float_round_nearest_even);
224
    increment = ((int64_t)absZ1 < 0);
225
    if (!roundNearestEven) {
226
        if (roundingMode == float_round_to_zero) {
227
            increment = 0;
228
        } else if (absZ1) {
229
            if (zSign) {
230
                increment = (roundingMode == float_round_down) && absZ1;
231
            } else {
232
                increment = (roundingMode == float_round_up) && absZ1;
233
            }
234
        }
235
    }
236
    if (increment) {
237
        ++absZ0;
238
        if (absZ0 == 0) {
239
            float_raise(float_flag_invalid STATUS_VAR);
240
            return LIT64(0xFFFFFFFFFFFFFFFF);
241
        }
242
        absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
243
    }
244

    
245
    if (zSign && absZ0) {
246
        float_raise(float_flag_invalid STATUS_VAR);
247
        return 0;
248
    }
249

    
250
    if (absZ1) {
251
        STATUS(float_exception_flags) |= float_flag_inexact;
252
    }
253
    return absZ0;
254
}
255

    
256
/*----------------------------------------------------------------------------
257
| Returns the fraction bits of the single-precision floating-point value `a'.
258
*----------------------------------------------------------------------------*/
259

    
260
INLINE uint32_t extractFloat32Frac( float32 a )
261
{
262

    
263
    return float32_val(a) & 0x007FFFFF;
264

    
265
}
266

    
267
/*----------------------------------------------------------------------------
268
| Returns the exponent bits of the single-precision floating-point value `a'.
269
*----------------------------------------------------------------------------*/
270

    
271
INLINE int_fast16_t extractFloat32Exp(float32 a)
272
{
273

    
274
    return ( float32_val(a)>>23 ) & 0xFF;
275

    
276
}
277

    
278
/*----------------------------------------------------------------------------
279
| Returns the sign bit of the single-precision floating-point value `a'.
280
*----------------------------------------------------------------------------*/
281

    
282
INLINE flag extractFloat32Sign( float32 a )
283
{
284

    
285
    return float32_val(a)>>31;
286

    
287
}
288

    
289
/*----------------------------------------------------------------------------
290
| If `a' is denormal and we are in flush-to-zero mode then set the
291
| input-denormal exception and return zero. Otherwise just return the value.
292
*----------------------------------------------------------------------------*/
293
static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
294
{
295
    if (STATUS(flush_inputs_to_zero)) {
296
        if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
297
            float_raise(float_flag_input_denormal STATUS_VAR);
298
            return make_float32(float32_val(a) & 0x80000000);
299
        }
300
    }
301
    return a;
302
}
303

    
304
/*----------------------------------------------------------------------------
305
| Normalizes the subnormal single-precision floating-point value represented
306
| by the denormalized significand `aSig'.  The normalized exponent and
307
| significand are stored at the locations pointed to by `zExpPtr' and
308
| `zSigPtr', respectively.
309
*----------------------------------------------------------------------------*/
310

    
311
static void
312
 normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
313
{
314
    int8 shiftCount;
315

    
316
    shiftCount = countLeadingZeros32( aSig ) - 8;
317
    *zSigPtr = aSig<<shiftCount;
318
    *zExpPtr = 1 - shiftCount;
319

    
320
}
321

    
322
/*----------------------------------------------------------------------------
323
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
324
| single-precision floating-point value, returning the result.  After being
325
| shifted into the proper positions, the three fields are simply added
326
| together to form the result.  This means that any integer portion of `zSig'
327
| will be added into the exponent.  Since a properly normalized significand
328
| will have an integer portion equal to 1, the `zExp' input should be 1 less
329
| than the desired result exponent whenever `zSig' is a complete, normalized
330
| significand.
331
*----------------------------------------------------------------------------*/
332

    
333
INLINE float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
334
{
335

    
336
    return make_float32(
337
          ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
338

    
339
}
340

    
341
/*----------------------------------------------------------------------------
342
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
343
| and significand `zSig', and returns the proper single-precision floating-
344
| point value corresponding to the abstract input.  Ordinarily, the abstract
345
| value is simply rounded and packed into the single-precision format, with
346
| the inexact exception raised if the abstract input cannot be represented
347
| exactly.  However, if the abstract value is too large, the overflow and
348
| inexact exceptions are raised and an infinity or maximal finite value is
349
| returned.  If the abstract value is too small, the input value is rounded to
350
| a subnormal number, and the underflow and inexact exceptions are raised if
351
| the abstract input cannot be represented exactly as a subnormal single-
352
| precision floating-point number.
353
|     The input significand `zSig' has its binary point between bits 30
354
| and 29, which is 7 bits to the left of the usual location.  This shifted
355
| significand must be normalized or smaller.  If `zSig' is not normalized,
356
| `zExp' must be 0; in that case, the result returned is a subnormal number,
357
| and it must not require rounding.  In the usual case that `zSig' is
358
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
359
| The handling of underflow and overflow follows the IEC/IEEE Standard for
360
| Binary Floating-Point Arithmetic.
361
*----------------------------------------------------------------------------*/
362

    
363
static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
364
{
365
    int8 roundingMode;
366
    flag roundNearestEven;
367
    int8 roundIncrement, roundBits;
368
    flag isTiny;
369

    
370
    roundingMode = STATUS(float_rounding_mode);
371
    roundNearestEven = ( roundingMode == float_round_nearest_even );
372
    roundIncrement = 0x40;
373
    if ( ! roundNearestEven ) {
374
        if ( roundingMode == float_round_to_zero ) {
375
            roundIncrement = 0;
376
        }
377
        else {
378
            roundIncrement = 0x7F;
379
            if ( zSign ) {
380
                if ( roundingMode == float_round_up ) roundIncrement = 0;
381
            }
382
            else {
383
                if ( roundingMode == float_round_down ) roundIncrement = 0;
384
            }
385
        }
386
    }
387
    roundBits = zSig & 0x7F;
388
    if ( 0xFD <= (uint16_t) zExp ) {
389
        if (    ( 0xFD < zExp )
390
             || (    ( zExp == 0xFD )
391
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
392
           ) {
393
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
394
            return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
395
        }
396
        if ( zExp < 0 ) {
397
            if (STATUS(flush_to_zero)) {
398
                float_raise(float_flag_output_denormal STATUS_VAR);
399
                return packFloat32(zSign, 0, 0);
400
            }
401
            isTiny =
402
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
403
                || ( zExp < -1 )
404
                || ( zSig + roundIncrement < 0x80000000 );
405
            shift32RightJamming( zSig, - zExp, &zSig );
406
            zExp = 0;
407
            roundBits = zSig & 0x7F;
408
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
409
        }
410
    }
411
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
412
    zSig = ( zSig + roundIncrement )>>7;
413
    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
414
    if ( zSig == 0 ) zExp = 0;
415
    return packFloat32( zSign, zExp, zSig );
416

    
417
}
418

    
419
/*----------------------------------------------------------------------------
420
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
421
| and significand `zSig', and returns the proper single-precision floating-
422
| point value corresponding to the abstract input.  This routine is just like
423
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
424
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
425
| floating-point exponent.
426
*----------------------------------------------------------------------------*/
427

    
428
static float32
429
 normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
430
{
431
    int8 shiftCount;
432

    
433
    shiftCount = countLeadingZeros32( zSig ) - 1;
434
    return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
435

    
436
}
437

    
438
/*----------------------------------------------------------------------------
439
| Returns the fraction bits of the double-precision floating-point value `a'.
440
*----------------------------------------------------------------------------*/
441

    
442
INLINE uint64_t extractFloat64Frac( float64 a )
443
{
444

    
445
    return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
446

    
447
}
448

    
449
/*----------------------------------------------------------------------------
450
| Returns the exponent bits of the double-precision floating-point value `a'.
451
*----------------------------------------------------------------------------*/
452

    
453
INLINE int_fast16_t extractFloat64Exp(float64 a)
454
{
455

    
456
    return ( float64_val(a)>>52 ) & 0x7FF;
457

    
458
}
459

    
460
/*----------------------------------------------------------------------------
461
| Returns the sign bit of the double-precision floating-point value `a'.
462
*----------------------------------------------------------------------------*/
463

    
464
INLINE flag extractFloat64Sign( float64 a )
465
{
466

    
467
    return float64_val(a)>>63;
468

    
469
}
470

    
471
/*----------------------------------------------------------------------------
472
| If `a' is denormal and we are in flush-to-zero mode then set the
473
| input-denormal exception and return zero. Otherwise just return the value.
474
*----------------------------------------------------------------------------*/
475
static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
476
{
477
    if (STATUS(flush_inputs_to_zero)) {
478
        if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
479
            float_raise(float_flag_input_denormal STATUS_VAR);
480
            return make_float64(float64_val(a) & (1ULL << 63));
481
        }
482
    }
483
    return a;
484
}
485

    
486
/*----------------------------------------------------------------------------
487
| Normalizes the subnormal double-precision floating-point value represented
488
| by the denormalized significand `aSig'.  The normalized exponent and
489
| significand are stored at the locations pointed to by `zExpPtr' and
490
| `zSigPtr', respectively.
491
*----------------------------------------------------------------------------*/
492

    
493
static void
494
 normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
495
{
496
    int8 shiftCount;
497

    
498
    shiftCount = countLeadingZeros64( aSig ) - 11;
499
    *zSigPtr = aSig<<shiftCount;
500
    *zExpPtr = 1 - shiftCount;
501

    
502
}
503

    
504
/*----------------------------------------------------------------------------
505
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
506
| double-precision floating-point value, returning the result.  After being
507
| shifted into the proper positions, the three fields are simply added
508
| together to form the result.  This means that any integer portion of `zSig'
509
| will be added into the exponent.  Since a properly normalized significand
510
| will have an integer portion equal to 1, the `zExp' input should be 1 less
511
| than the desired result exponent whenever `zSig' is a complete, normalized
512
| significand.
513
*----------------------------------------------------------------------------*/
514

    
515
INLINE float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
516
{
517

    
518
    return make_float64(
519
        ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
520

    
521
}
522

    
523
/*----------------------------------------------------------------------------
524
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
525
| and significand `zSig', and returns the proper double-precision floating-
526
| point value corresponding to the abstract input.  Ordinarily, the abstract
527
| value is simply rounded and packed into the double-precision format, with
528
| the inexact exception raised if the abstract input cannot be represented
529
| exactly.  However, if the abstract value is too large, the overflow and
530
| inexact exceptions are raised and an infinity or maximal finite value is
531
| returned.  If the abstract value is too small, the input value is rounded
532
| to a subnormal number, and the underflow and inexact exceptions are raised
533
| if the abstract input cannot be represented exactly as a subnormal double-
534
| precision floating-point number.
535
|     The input significand `zSig' has its binary point between bits 62
536
| and 61, which is 10 bits to the left of the usual location.  This shifted
537
| significand must be normalized or smaller.  If `zSig' is not normalized,
538
| `zExp' must be 0; in that case, the result returned is a subnormal number,
539
| and it must not require rounding.  In the usual case that `zSig' is
540
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
541
| The handling of underflow and overflow follows the IEC/IEEE Standard for
542
| Binary Floating-Point Arithmetic.
543
*----------------------------------------------------------------------------*/
544

    
545
static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
546
{
547
    int8 roundingMode;
548
    flag roundNearestEven;
549
    int_fast16_t roundIncrement, roundBits;
550
    flag isTiny;
551

    
552
    roundingMode = STATUS(float_rounding_mode);
553
    roundNearestEven = ( roundingMode == float_round_nearest_even );
554
    roundIncrement = 0x200;
555
    if ( ! roundNearestEven ) {
556
        if ( roundingMode == float_round_to_zero ) {
557
            roundIncrement = 0;
558
        }
559
        else {
560
            roundIncrement = 0x3FF;
561
            if ( zSign ) {
562
                if ( roundingMode == float_round_up ) roundIncrement = 0;
563
            }
564
            else {
565
                if ( roundingMode == float_round_down ) roundIncrement = 0;
566
            }
567
        }
568
    }
569
    roundBits = zSig & 0x3FF;
570
    if ( 0x7FD <= (uint16_t) zExp ) {
571
        if (    ( 0x7FD < zExp )
572
             || (    ( zExp == 0x7FD )
573
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
574
           ) {
575
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
576
            return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
577
        }
578
        if ( zExp < 0 ) {
579
            if (STATUS(flush_to_zero)) {
580
                float_raise(float_flag_output_denormal STATUS_VAR);
581
                return packFloat64(zSign, 0, 0);
582
            }
583
            isTiny =
584
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
585
                || ( zExp < -1 )
586
                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
587
            shift64RightJamming( zSig, - zExp, &zSig );
588
            zExp = 0;
589
            roundBits = zSig & 0x3FF;
590
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
591
        }
592
    }
593
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
594
    zSig = ( zSig + roundIncrement )>>10;
595
    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
596
    if ( zSig == 0 ) zExp = 0;
597
    return packFloat64( zSign, zExp, zSig );
598

    
599
}
600

    
601
/*----------------------------------------------------------------------------
602
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
603
| and significand `zSig', and returns the proper double-precision floating-
604
| point value corresponding to the abstract input.  This routine is just like
605
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
606
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
607
| floating-point exponent.
608
*----------------------------------------------------------------------------*/
609

    
610
static float64
611
 normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
612
{
613
    int8 shiftCount;
614

    
615
    shiftCount = countLeadingZeros64( zSig ) - 1;
616
    return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
617

    
618
}
619

    
620
/*----------------------------------------------------------------------------
621
| Returns the fraction bits of the extended double-precision floating-point
622
| value `a'.
623
*----------------------------------------------------------------------------*/
624

    
625
INLINE uint64_t extractFloatx80Frac( floatx80 a )
626
{
627

    
628
    return a.low;
629

    
630
}
631

    
632
/*----------------------------------------------------------------------------
633
| Returns the exponent bits of the extended double-precision floating-point
634
| value `a'.
635
*----------------------------------------------------------------------------*/
636

    
637
INLINE int32 extractFloatx80Exp( floatx80 a )
638
{
639

    
640
    return a.high & 0x7FFF;
641

    
642
}
643

    
644
/*----------------------------------------------------------------------------
645
| Returns the sign bit of the extended double-precision floating-point value
646
| `a'.
647
*----------------------------------------------------------------------------*/
648

    
649
INLINE flag extractFloatx80Sign( floatx80 a )
650
{
651

    
652
    return a.high>>15;
653

    
654
}
655

    
656
/*----------------------------------------------------------------------------
657
| Normalizes the subnormal extended double-precision floating-point value
658
| represented by the denormalized significand `aSig'.  The normalized exponent
659
| and significand are stored at the locations pointed to by `zExpPtr' and
660
| `zSigPtr', respectively.
661
*----------------------------------------------------------------------------*/
662

    
663
static void
664
 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
665
{
666
    int8 shiftCount;
667

    
668
    shiftCount = countLeadingZeros64( aSig );
669
    *zSigPtr = aSig<<shiftCount;
670
    *zExpPtr = 1 - shiftCount;
671

    
672
}
673

    
674
/*----------------------------------------------------------------------------
675
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
676
| extended double-precision floating-point value, returning the result.
677
*----------------------------------------------------------------------------*/
678

    
679
INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
680
{
681
    floatx80 z;
682

    
683
    z.low = zSig;
684
    z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
685
    return z;
686

    
687
}
688

    
689
/*----------------------------------------------------------------------------
690
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
691
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
692
| and returns the proper extended double-precision floating-point value
693
| corresponding to the abstract input.  Ordinarily, the abstract value is
694
| rounded and packed into the extended double-precision format, with the
695
| inexact exception raised if the abstract input cannot be represented
696
| exactly.  However, if the abstract value is too large, the overflow and
697
| inexact exceptions are raised and an infinity or maximal finite value is
698
| returned.  If the abstract value is too small, the input value is rounded to
699
| a subnormal number, and the underflow and inexact exceptions are raised if
700
| the abstract input cannot be represented exactly as a subnormal extended
701
| double-precision floating-point number.
702
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
703
| number of bits as single or double precision, respectively.  Otherwise, the
704
| result is rounded to the full precision of the extended double-precision
705
| format.
706
|     The input significand must be normalized or smaller.  If the input
707
| significand is not normalized, `zExp' must be 0; in that case, the result
708
| returned is a subnormal number, and it must not require rounding.  The
709
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
710
| Floating-Point Arithmetic.
711
*----------------------------------------------------------------------------*/
712

    
713
static floatx80
714
 roundAndPackFloatx80(
715
     int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
716
 STATUS_PARAM)
717
{
718
    int8 roundingMode;
719
    flag roundNearestEven, increment, isTiny;
720
    int64 roundIncrement, roundMask, roundBits;
721

    
722
    roundingMode = STATUS(float_rounding_mode);
723
    roundNearestEven = ( roundingMode == float_round_nearest_even );
724
    if ( roundingPrecision == 80 ) goto precision80;
725
    if ( roundingPrecision == 64 ) {
726
        roundIncrement = LIT64( 0x0000000000000400 );
727
        roundMask = LIT64( 0x00000000000007FF );
728
    }
729
    else if ( roundingPrecision == 32 ) {
730
        roundIncrement = LIT64( 0x0000008000000000 );
731
        roundMask = LIT64( 0x000000FFFFFFFFFF );
732
    }
733
    else {
734
        goto precision80;
735
    }
736
    zSig0 |= ( zSig1 != 0 );
737
    if ( ! roundNearestEven ) {
738
        if ( roundingMode == float_round_to_zero ) {
739
            roundIncrement = 0;
740
        }
741
        else {
742
            roundIncrement = roundMask;
743
            if ( zSign ) {
744
                if ( roundingMode == float_round_up ) roundIncrement = 0;
745
            }
746
            else {
747
                if ( roundingMode == float_round_down ) roundIncrement = 0;
748
            }
749
        }
750
    }
751
    roundBits = zSig0 & roundMask;
752
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
753
        if (    ( 0x7FFE < zExp )
754
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
755
           ) {
756
            goto overflow;
757
        }
758
        if ( zExp <= 0 ) {
759
            if (STATUS(flush_to_zero)) {
760
                float_raise(float_flag_output_denormal STATUS_VAR);
761
                return packFloatx80(zSign, 0, 0);
762
            }
763
            isTiny =
764
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
765
                || ( zExp < 0 )
766
                || ( zSig0 <= zSig0 + roundIncrement );
767
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
768
            zExp = 0;
769
            roundBits = zSig0 & roundMask;
770
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
771
            if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
772
            zSig0 += roundIncrement;
773
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
774
            roundIncrement = roundMask + 1;
775
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
776
                roundMask |= roundIncrement;
777
            }
778
            zSig0 &= ~ roundMask;
779
            return packFloatx80( zSign, zExp, zSig0 );
780
        }
781
    }
782
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
783
    zSig0 += roundIncrement;
784
    if ( zSig0 < roundIncrement ) {
785
        ++zExp;
786
        zSig0 = LIT64( 0x8000000000000000 );
787
    }
788
    roundIncrement = roundMask + 1;
789
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
790
        roundMask |= roundIncrement;
791
    }
792
    zSig0 &= ~ roundMask;
793
    if ( zSig0 == 0 ) zExp = 0;
794
    return packFloatx80( zSign, zExp, zSig0 );
795
 precision80:
796
    increment = ( (int64_t) zSig1 < 0 );
797
    if ( ! roundNearestEven ) {
798
        if ( roundingMode == float_round_to_zero ) {
799
            increment = 0;
800
        }
801
        else {
802
            if ( zSign ) {
803
                increment = ( roundingMode == float_round_down ) && zSig1;
804
            }
805
            else {
806
                increment = ( roundingMode == float_round_up ) && zSig1;
807
            }
808
        }
809
    }
810
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
811
        if (    ( 0x7FFE < zExp )
812
             || (    ( zExp == 0x7FFE )
813
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
814
                  && increment
815
                )
816
           ) {
817
            roundMask = 0;
818
 overflow:
819
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
820
            if (    ( roundingMode == float_round_to_zero )
821
                 || ( zSign && ( roundingMode == float_round_up ) )
822
                 || ( ! zSign && ( roundingMode == float_round_down ) )
823
               ) {
824
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
825
            }
826
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
827
        }
828
        if ( zExp <= 0 ) {
829
            isTiny =
830
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
831
                || ( zExp < 0 )
832
                || ! increment
833
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
834
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
835
            zExp = 0;
836
            if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
837
            if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
838
            if ( roundNearestEven ) {
839
                increment = ( (int64_t) zSig1 < 0 );
840
            }
841
            else {
842
                if ( zSign ) {
843
                    increment = ( roundingMode == float_round_down ) && zSig1;
844
                }
845
                else {
846
                    increment = ( roundingMode == float_round_up ) && zSig1;
847
                }
848
            }
849
            if ( increment ) {
850
                ++zSig0;
851
                zSig0 &=
852
                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
853
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
854
            }
855
            return packFloatx80( zSign, zExp, zSig0 );
856
        }
857
    }
858
    if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
859
    if ( increment ) {
860
        ++zSig0;
861
        if ( zSig0 == 0 ) {
862
            ++zExp;
863
            zSig0 = LIT64( 0x8000000000000000 );
864
        }
865
        else {
866
            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
867
        }
868
    }
869
    else {
870
        if ( zSig0 == 0 ) zExp = 0;
871
    }
872
    return packFloatx80( zSign, zExp, zSig0 );
873

    
874
}
875

    
876
/*----------------------------------------------------------------------------
877
| Takes an abstract floating-point value having sign `zSign', exponent
878
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
879
| and returns the proper extended double-precision floating-point value
880
| corresponding to the abstract input.  This routine is just like
881
| `roundAndPackFloatx80' except that the input significand does not have to be
882
| normalized.
883
*----------------------------------------------------------------------------*/
884

    
885
static floatx80
886
 normalizeRoundAndPackFloatx80(
887
     int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
888
 STATUS_PARAM)
889
{
890
    int8 shiftCount;
891

    
892
    if ( zSig0 == 0 ) {
893
        zSig0 = zSig1;
894
        zSig1 = 0;
895
        zExp -= 64;
896
    }
897
    shiftCount = countLeadingZeros64( zSig0 );
898
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
899
    zExp -= shiftCount;
900
    return
901
        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
902

    
903
}
904

    
905
/*----------------------------------------------------------------------------
906
| Returns the least-significant 64 fraction bits of the quadruple-precision
907
| floating-point value `a'.
908
*----------------------------------------------------------------------------*/
909

    
910
INLINE uint64_t extractFloat128Frac1( float128 a )
911
{
912

    
913
    return a.low;
914

    
915
}
916

    
917
/*----------------------------------------------------------------------------
918
| Returns the most-significant 48 fraction bits of the quadruple-precision
919
| floating-point value `a'.
920
*----------------------------------------------------------------------------*/
921

    
922
INLINE uint64_t extractFloat128Frac0( float128 a )
923
{
924

    
925
    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
926

    
927
}
928

    
929
/*----------------------------------------------------------------------------
930
| Returns the exponent bits of the quadruple-precision floating-point value
931
| `a'.
932
*----------------------------------------------------------------------------*/
933

    
934
INLINE int32 extractFloat128Exp( float128 a )
935
{
936

    
937
    return ( a.high>>48 ) & 0x7FFF;
938

    
939
}
940

    
941
/*----------------------------------------------------------------------------
942
| Returns the sign bit of the quadruple-precision floating-point value `a'.
943
*----------------------------------------------------------------------------*/
944

    
945
INLINE flag extractFloat128Sign( float128 a )
946
{
947

    
948
    return a.high>>63;
949

    
950
}
951

    
952
/*----------------------------------------------------------------------------
953
| Normalizes the subnormal quadruple-precision floating-point value
954
| represented by the denormalized significand formed by the concatenation of
955
| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
956
| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
957
| significand are stored at the location pointed to by `zSig0Ptr', and the
958
| least significant 64 bits of the normalized significand are stored at the
959
| location pointed to by `zSig1Ptr'.
960
*----------------------------------------------------------------------------*/
961

    
962
static void
963
 normalizeFloat128Subnormal(
964
     uint64_t aSig0,
965
     uint64_t aSig1,
966
     int32 *zExpPtr,
967
     uint64_t *zSig0Ptr,
968
     uint64_t *zSig1Ptr
969
 )
970
{
971
    int8 shiftCount;
972

    
973
    if ( aSig0 == 0 ) {
974
        shiftCount = countLeadingZeros64( aSig1 ) - 15;
975
        if ( shiftCount < 0 ) {
976
            *zSig0Ptr = aSig1>>( - shiftCount );
977
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
978
        }
979
        else {
980
            *zSig0Ptr = aSig1<<shiftCount;
981
            *zSig1Ptr = 0;
982
        }
983
        *zExpPtr = - shiftCount - 63;
984
    }
985
    else {
986
        shiftCount = countLeadingZeros64( aSig0 ) - 15;
987
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
988
        *zExpPtr = 1 - shiftCount;
989
    }
990

    
991
}
992

    
993
/*----------------------------------------------------------------------------
994
| Packs the sign `zSign', the exponent `zExp', and the significand formed
995
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
996
| floating-point value, returning the result.  After being shifted into the
997
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
998
| added together to form the most significant 32 bits of the result.  This
999
| means that any integer portion of `zSig0' will be added into the exponent.
1000
| Since a properly normalized significand will have an integer portion equal
1001
| to 1, the `zExp' input should be 1 less than the desired result exponent
1002
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1003
| significand.
1004
*----------------------------------------------------------------------------*/
1005

    
1006
INLINE float128
1007
 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
1008
{
1009
    float128 z;
1010

    
1011
    z.low = zSig1;
1012
    z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1013
    return z;
1014

    
1015
}
1016

    
1017
/*----------------------------------------------------------------------------
1018
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1019
| and extended significand formed by the concatenation of `zSig0', `zSig1',
1020
| and `zSig2', and returns the proper quadruple-precision floating-point value
1021
| corresponding to the abstract input.  Ordinarily, the abstract value is
1022
| simply rounded and packed into the quadruple-precision format, with the
1023
| inexact exception raised if the abstract input cannot be represented
1024
| exactly.  However, if the abstract value is too large, the overflow and
1025
| inexact exceptions are raised and an infinity or maximal finite value is
1026
| returned.  If the abstract value is too small, the input value is rounded to
1027
| a subnormal number, and the underflow and inexact exceptions are raised if
1028
| the abstract input cannot be represented exactly as a subnormal quadruple-
1029
| precision floating-point number.
1030
|     The input significand must be normalized or smaller.  If the input
1031
| significand is not normalized, `zExp' must be 0; in that case, the result
1032
| returned is a subnormal number, and it must not require rounding.  In the
1033
| usual case that the input significand is normalized, `zExp' must be 1 less
1034
| than the ``true'' floating-point exponent.  The handling of underflow and
1035
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1036
*----------------------------------------------------------------------------*/
1037

    
1038
static float128
1039
 roundAndPackFloat128(
1040
     flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
1041
{
1042
    int8 roundingMode;
1043
    flag roundNearestEven, increment, isTiny;
1044

    
1045
    roundingMode = STATUS(float_rounding_mode);
1046
    roundNearestEven = ( roundingMode == float_round_nearest_even );
1047
    increment = ( (int64_t) zSig2 < 0 );
1048
    if ( ! roundNearestEven ) {
1049
        if ( roundingMode == float_round_to_zero ) {
1050
            increment = 0;
1051
        }
1052
        else {
1053
            if ( zSign ) {
1054
                increment = ( roundingMode == float_round_down ) && zSig2;
1055
            }
1056
            else {
1057
                increment = ( roundingMode == float_round_up ) && zSig2;
1058
            }
1059
        }
1060
    }
1061
    if ( 0x7FFD <= (uint32_t) zExp ) {
1062
        if (    ( 0x7FFD < zExp )
1063
             || (    ( zExp == 0x7FFD )
1064
                  && eq128(
1065
                         LIT64( 0x0001FFFFFFFFFFFF ),
1066
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
1067
                         zSig0,
1068
                         zSig1
1069
                     )
1070
                  && increment
1071
                )
1072
           ) {
1073
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1074
            if (    ( roundingMode == float_round_to_zero )
1075
                 || ( zSign && ( roundingMode == float_round_up ) )
1076
                 || ( ! zSign && ( roundingMode == float_round_down ) )
1077
               ) {
1078
                return
1079
                    packFloat128(
1080
                        zSign,
1081
                        0x7FFE,
1082
                        LIT64( 0x0000FFFFFFFFFFFF ),
1083
                        LIT64( 0xFFFFFFFFFFFFFFFF )
1084
                    );
1085
            }
1086
            return packFloat128( zSign, 0x7FFF, 0, 0 );
1087
        }
1088
        if ( zExp < 0 ) {
1089
            if (STATUS(flush_to_zero)) {
1090
                float_raise(float_flag_output_denormal STATUS_VAR);
1091
                return packFloat128(zSign, 0, 0, 0);
1092
            }
1093
            isTiny =
1094
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1095
                || ( zExp < -1 )
1096
                || ! increment
1097
                || lt128(
1098
                       zSig0,
1099
                       zSig1,
1100
                       LIT64( 0x0001FFFFFFFFFFFF ),
1101
                       LIT64( 0xFFFFFFFFFFFFFFFF )
1102
                   );
1103
            shift128ExtraRightJamming(
1104
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1105
            zExp = 0;
1106
            if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1107
            if ( roundNearestEven ) {
1108
                increment = ( (int64_t) zSig2 < 0 );
1109
            }
1110
            else {
1111
                if ( zSign ) {
1112
                    increment = ( roundingMode == float_round_down ) && zSig2;
1113
                }
1114
                else {
1115
                    increment = ( roundingMode == float_round_up ) && zSig2;
1116
                }
1117
            }
1118
        }
1119
    }
1120
    if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1121
    if ( increment ) {
1122
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1123
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1124
    }
1125
    else {
1126
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1127
    }
1128
    return packFloat128( zSign, zExp, zSig0, zSig1 );
1129

    
1130
}
1131

    
1132
/*----------------------------------------------------------------------------
1133
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1134
| and significand formed by the concatenation of `zSig0' and `zSig1', and
1135
| returns the proper quadruple-precision floating-point value corresponding
1136
| to the abstract input.  This routine is just like `roundAndPackFloat128'
1137
| except that the input significand has fewer bits and does not have to be
1138
| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1139
| point exponent.
1140
*----------------------------------------------------------------------------*/
1141

    
1142
static float128
1143
 normalizeRoundAndPackFloat128(
1144
     flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1145
{
1146
    int8 shiftCount;
1147
    uint64_t zSig2;
1148

    
1149
    if ( zSig0 == 0 ) {
1150
        zSig0 = zSig1;
1151
        zSig1 = 0;
1152
        zExp -= 64;
1153
    }
1154
    shiftCount = countLeadingZeros64( zSig0 ) - 15;
1155
    if ( 0 <= shiftCount ) {
1156
        zSig2 = 0;
1157
        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1158
    }
1159
    else {
1160
        shift128ExtraRightJamming(
1161
            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1162
    }
1163
    zExp -= shiftCount;
1164
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1165

    
1166
}
1167

    
1168
/*----------------------------------------------------------------------------
1169
| Returns the result of converting the 32-bit two's complement integer `a'
1170
| to the single-precision floating-point format.  The conversion is performed
1171
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1172
*----------------------------------------------------------------------------*/
1173

    
1174
float32 int32_to_float32(int32_t a STATUS_PARAM)
1175
{
1176
    flag zSign;
1177

    
1178
    if ( a == 0 ) return float32_zero;
1179
    if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1180
    zSign = ( a < 0 );
1181
    return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1182

    
1183
}
1184

    
1185
/*----------------------------------------------------------------------------
1186
| Returns the result of converting the 32-bit two's complement integer `a'
1187
| to the double-precision floating-point format.  The conversion is performed
1188
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1189
*----------------------------------------------------------------------------*/
1190

    
1191
float64 int32_to_float64(int32_t a STATUS_PARAM)
1192
{
1193
    flag zSign;
1194
    uint32 absA;
1195
    int8 shiftCount;
1196
    uint64_t zSig;
1197

    
1198
    if ( a == 0 ) return float64_zero;
1199
    zSign = ( a < 0 );
1200
    absA = zSign ? - a : a;
1201
    shiftCount = countLeadingZeros32( absA ) + 21;
1202
    zSig = absA;
1203
    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1204

    
1205
}
1206

    
1207
/*----------------------------------------------------------------------------
1208
| Returns the result of converting the 32-bit two's complement integer `a'
1209
| to the extended double-precision floating-point format.  The conversion
1210
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1211
| Arithmetic.
1212
*----------------------------------------------------------------------------*/
1213

    
1214
floatx80 int32_to_floatx80(int32_t a STATUS_PARAM)
1215
{
1216
    flag zSign;
1217
    uint32 absA;
1218
    int8 shiftCount;
1219
    uint64_t zSig;
1220

    
1221
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1222
    zSign = ( a < 0 );
1223
    absA = zSign ? - a : a;
1224
    shiftCount = countLeadingZeros32( absA ) + 32;
1225
    zSig = absA;
1226
    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1227

    
1228
}
1229

    
1230
/*----------------------------------------------------------------------------
1231
| Returns the result of converting the 32-bit two's complement integer `a' to
1232
| the quadruple-precision floating-point format.  The conversion is performed
1233
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1234
*----------------------------------------------------------------------------*/
1235

    
1236
float128 int32_to_float128(int32_t a STATUS_PARAM)
1237
{
1238
    flag zSign;
1239
    uint32 absA;
1240
    int8 shiftCount;
1241
    uint64_t zSig0;
1242

    
1243
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1244
    zSign = ( a < 0 );
1245
    absA = zSign ? - a : a;
1246
    shiftCount = countLeadingZeros32( absA ) + 17;
1247
    zSig0 = absA;
1248
    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1249

    
1250
}
1251

    
1252
/*----------------------------------------------------------------------------
1253
| Returns the result of converting the 64-bit two's complement integer `a'
1254
| to the single-precision floating-point format.  The conversion is performed
1255
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1256
*----------------------------------------------------------------------------*/
1257

    
1258
float32 int64_to_float32(int64_t a STATUS_PARAM)
1259
{
1260
    flag zSign;
1261
    uint64 absA;
1262
    int8 shiftCount;
1263

    
1264
    if ( a == 0 ) return float32_zero;
1265
    zSign = ( a < 0 );
1266
    absA = zSign ? - a : a;
1267
    shiftCount = countLeadingZeros64( absA ) - 40;
1268
    if ( 0 <= shiftCount ) {
1269
        return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1270
    }
1271
    else {
1272
        shiftCount += 7;
1273
        if ( shiftCount < 0 ) {
1274
            shift64RightJamming( absA, - shiftCount, &absA );
1275
        }
1276
        else {
1277
            absA <<= shiftCount;
1278
        }
1279
        return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1280
    }
1281

    
1282
}
1283

    
1284
float32 uint64_to_float32(uint64_t a STATUS_PARAM)
1285
{
1286
    int8 shiftCount;
1287

    
1288
    if ( a == 0 ) return float32_zero;
1289
    shiftCount = countLeadingZeros64( a ) - 40;
1290
    if ( 0 <= shiftCount ) {
1291
        return packFloat32(0, 0x95 - shiftCount, a<<shiftCount);
1292
    }
1293
    else {
1294
        shiftCount += 7;
1295
        if ( shiftCount < 0 ) {
1296
            shift64RightJamming( a, - shiftCount, &a );
1297
        }
1298
        else {
1299
            a <<= shiftCount;
1300
        }
1301
        return roundAndPackFloat32(0, 0x9C - shiftCount, a STATUS_VAR);
1302
    }
1303
}
1304

    
1305
/*----------------------------------------------------------------------------
1306
| Returns the result of converting the 64-bit two's complement integer `a'
1307
| to the double-precision floating-point format.  The conversion is performed
1308
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1309
*----------------------------------------------------------------------------*/
1310

    
1311
float64 int64_to_float64(int64_t a STATUS_PARAM)
1312
{
1313
    flag zSign;
1314

    
1315
    if ( a == 0 ) return float64_zero;
1316
    if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1317
        return packFloat64( 1, 0x43E, 0 );
1318
    }
1319
    zSign = ( a < 0 );
1320
    return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1321

    
1322
}
1323

    
1324
float64 uint64_to_float64(uint64_t a STATUS_PARAM)
1325
{
1326
    int exp =  0x43C;
1327

    
1328
    if (a == 0) {
1329
        return float64_zero;
1330
    }
1331
    if ((int64_t)a < 0) {
1332
        shift64RightJamming(a, 1, &a);
1333
        exp += 1;
1334
    }
1335
    return normalizeRoundAndPackFloat64(0, exp, a STATUS_VAR);
1336
}
1337

    
1338
/*----------------------------------------------------------------------------
1339
| Returns the result of converting the 64-bit two's complement integer `a'
1340
| to the extended double-precision floating-point format.  The conversion
1341
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1342
| Arithmetic.
1343
*----------------------------------------------------------------------------*/
1344

    
1345
floatx80 int64_to_floatx80(int64_t a STATUS_PARAM)
1346
{
1347
    flag zSign;
1348
    uint64 absA;
1349
    int8 shiftCount;
1350

    
1351
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1352
    zSign = ( a < 0 );
1353
    absA = zSign ? - a : a;
1354
    shiftCount = countLeadingZeros64( absA );
1355
    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1356

    
1357
}
1358

    
1359
/*----------------------------------------------------------------------------
1360
| Returns the result of converting the 64-bit two's complement integer `a' to
1361
| the quadruple-precision floating-point format.  The conversion is performed
1362
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1363
*----------------------------------------------------------------------------*/
1364

    
1365
float128 int64_to_float128(int64_t a STATUS_PARAM)
1366
{
1367
    flag zSign;
1368
    uint64 absA;
1369
    int8 shiftCount;
1370
    int32 zExp;
1371
    uint64_t zSig0, zSig1;
1372

    
1373
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1374
    zSign = ( a < 0 );
1375
    absA = zSign ? - a : a;
1376
    shiftCount = countLeadingZeros64( absA ) + 49;
1377
    zExp = 0x406E - shiftCount;
1378
    if ( 64 <= shiftCount ) {
1379
        zSig1 = 0;
1380
        zSig0 = absA;
1381
        shiftCount -= 64;
1382
    }
1383
    else {
1384
        zSig1 = absA;
1385
        zSig0 = 0;
1386
    }
1387
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1388
    return packFloat128( zSign, zExp, zSig0, zSig1 );
1389

    
1390
}
1391

    
1392
float128 uint64_to_float128(uint64_t a STATUS_PARAM)
1393
{
1394
    if (a == 0) {
1395
        return float128_zero;
1396
    }
1397
    return normalizeRoundAndPackFloat128(0, 0x406E, a, 0 STATUS_VAR);
1398
}
1399

    
1400
/*----------------------------------------------------------------------------
1401
| Returns the result of converting the single-precision floating-point value
1402
| `a' to the 32-bit two's complement integer format.  The conversion is
1403
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1404
| Arithmetic---which means in particular that the conversion is rounded
1405
| according to the current rounding mode.  If `a' is a NaN, the largest
1406
| positive integer is returned.  Otherwise, if the conversion overflows, the
1407
| largest integer with the same sign as `a' is returned.
1408
*----------------------------------------------------------------------------*/
1409

    
1410
int32 float32_to_int32( float32 a STATUS_PARAM )
1411
{
1412
    flag aSign;
1413
    int_fast16_t aExp, shiftCount;
1414
    uint32_t aSig;
1415
    uint64_t aSig64;
1416

    
1417
    a = float32_squash_input_denormal(a STATUS_VAR);
1418
    aSig = extractFloat32Frac( a );
1419
    aExp = extractFloat32Exp( a );
1420
    aSign = extractFloat32Sign( a );
1421
    if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1422
    if ( aExp ) aSig |= 0x00800000;
1423
    shiftCount = 0xAF - aExp;
1424
    aSig64 = aSig;
1425
    aSig64 <<= 32;
1426
    if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1427
    return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1428

    
1429
}
1430

    
1431
/*----------------------------------------------------------------------------
1432
| Returns the result of converting the single-precision floating-point value
1433
| `a' to the 32-bit two's complement integer format.  The conversion is
1434
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1435
| Arithmetic, except that the conversion is always rounded toward zero.
1436
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1437
| the conversion overflows, the largest integer with the same sign as `a' is
1438
| returned.
1439
*----------------------------------------------------------------------------*/
1440

    
1441
int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1442
{
1443
    flag aSign;
1444
    int_fast16_t aExp, shiftCount;
1445
    uint32_t aSig;
1446
    int32_t z;
1447
    a = float32_squash_input_denormal(a STATUS_VAR);
1448

    
1449
    aSig = extractFloat32Frac( a );
1450
    aExp = extractFloat32Exp( a );
1451
    aSign = extractFloat32Sign( a );
1452
    shiftCount = aExp - 0x9E;
1453
    if ( 0 <= shiftCount ) {
1454
        if ( float32_val(a) != 0xCF000000 ) {
1455
            float_raise( float_flag_invalid STATUS_VAR);
1456
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1457
        }
1458
        return (int32_t) 0x80000000;
1459
    }
1460
    else if ( aExp <= 0x7E ) {
1461
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1462
        return 0;
1463
    }
1464
    aSig = ( aSig | 0x00800000 )<<8;
1465
    z = aSig>>( - shiftCount );
1466
    if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1467
        STATUS(float_exception_flags) |= float_flag_inexact;
1468
    }
1469
    if ( aSign ) z = - z;
1470
    return z;
1471

    
1472
}
1473

    
1474
/*----------------------------------------------------------------------------
1475
| Returns the result of converting the single-precision floating-point value
1476
| `a' to the 16-bit two's complement integer format.  The conversion is
1477
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1478
| Arithmetic, except that the conversion is always rounded toward zero.
1479
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1480
| the conversion overflows, the largest integer with the same sign as `a' is
1481
| returned.
1482
*----------------------------------------------------------------------------*/
1483

    
1484
int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)
1485
{
1486
    flag aSign;
1487
    int_fast16_t aExp, shiftCount;
1488
    uint32_t aSig;
1489
    int32 z;
1490

    
1491
    aSig = extractFloat32Frac( a );
1492
    aExp = extractFloat32Exp( a );
1493
    aSign = extractFloat32Sign( a );
1494
    shiftCount = aExp - 0x8E;
1495
    if ( 0 <= shiftCount ) {
1496
        if ( float32_val(a) != 0xC7000000 ) {
1497
            float_raise( float_flag_invalid STATUS_VAR);
1498
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1499
                return 0x7FFF;
1500
            }
1501
        }
1502
        return (int32_t) 0xffff8000;
1503
    }
1504
    else if ( aExp <= 0x7E ) {
1505
        if ( aExp | aSig ) {
1506
            STATUS(float_exception_flags) |= float_flag_inexact;
1507
        }
1508
        return 0;
1509
    }
1510
    shiftCount -= 0x10;
1511
    aSig = ( aSig | 0x00800000 )<<8;
1512
    z = aSig>>( - shiftCount );
1513
    if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1514
        STATUS(float_exception_flags) |= float_flag_inexact;
1515
    }
1516
    if ( aSign ) {
1517
        z = - z;
1518
    }
1519
    return z;
1520

    
1521
}
1522

    
1523
/*----------------------------------------------------------------------------
1524
| Returns the result of converting the single-precision floating-point value
1525
| `a' to the 64-bit two's complement integer format.  The conversion is
1526
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1527
| Arithmetic---which means in particular that the conversion is rounded
1528
| according to the current rounding mode.  If `a' is a NaN, the largest
1529
| positive integer is returned.  Otherwise, if the conversion overflows, the
1530
| largest integer with the same sign as `a' is returned.
1531
*----------------------------------------------------------------------------*/
1532

    
1533
int64 float32_to_int64( float32 a STATUS_PARAM )
1534
{
1535
    flag aSign;
1536
    int_fast16_t aExp, shiftCount;
1537
    uint32_t aSig;
1538
    uint64_t aSig64, aSigExtra;
1539
    a = float32_squash_input_denormal(a STATUS_VAR);
1540

    
1541
    aSig = extractFloat32Frac( a );
1542
    aExp = extractFloat32Exp( a );
1543
    aSign = extractFloat32Sign( a );
1544
    shiftCount = 0xBE - aExp;
1545
    if ( shiftCount < 0 ) {
1546
        float_raise( float_flag_invalid STATUS_VAR);
1547
        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1548
            return LIT64( 0x7FFFFFFFFFFFFFFF );
1549
        }
1550
        return (int64_t) LIT64( 0x8000000000000000 );
1551
    }
1552
    if ( aExp ) aSig |= 0x00800000;
1553
    aSig64 = aSig;
1554
    aSig64 <<= 40;
1555
    shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1556
    return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1557

    
1558
}
1559

    
1560
/*----------------------------------------------------------------------------
1561
| Returns the result of converting the single-precision floating-point value
1562
| `a' to the 64-bit unsigned integer format.  The conversion is
1563
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1564
| Arithmetic---which means in particular that the conversion is rounded
1565
| according to the current rounding mode.  If `a' is a NaN, the largest
1566
| unsigned integer is returned.  Otherwise, if the conversion overflows, the
1567
| largest unsigned integer is returned.  If the 'a' is negative, the result
1568
| is rounded and zero is returned; values that do not round to zero will
1569
| raise the inexact exception flag.
1570
*----------------------------------------------------------------------------*/
1571

    
1572
uint64 float32_to_uint64(float32 a STATUS_PARAM)
1573
{
1574
    flag aSign;
1575
    int_fast16_t aExp, shiftCount;
1576
    uint32_t aSig;
1577
    uint64_t aSig64, aSigExtra;
1578
    a = float32_squash_input_denormal(a STATUS_VAR);
1579

    
1580
    aSig = extractFloat32Frac(a);
1581
    aExp = extractFloat32Exp(a);
1582
    aSign = extractFloat32Sign(a);
1583
    if ((aSign) && (aExp > 126)) {
1584
        float_raise(float_flag_invalid STATUS_VAR);
1585
        if (float32_is_any_nan(a)) {
1586
            return LIT64(0xFFFFFFFFFFFFFFFF);
1587
        } else {
1588
            return 0;
1589
        }
1590
    }
1591
    shiftCount = 0xBE - aExp;
1592
    if (aExp) {
1593
        aSig |= 0x00800000;
1594
    }
1595
    if (shiftCount < 0) {
1596
        float_raise(float_flag_invalid STATUS_VAR);
1597
        return LIT64(0xFFFFFFFFFFFFFFFF);
1598
    }
1599

    
1600
    aSig64 = aSig;
1601
    aSig64 <<= 40;
1602
    shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1603
    return roundAndPackUint64(aSign, aSig64, aSigExtra STATUS_VAR);
1604
}
1605

    
1606
/*----------------------------------------------------------------------------
1607
| Returns the result of converting the single-precision floating-point value
1608
| `a' to the 64-bit two's complement integer format.  The conversion is
1609
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1610
| Arithmetic, except that the conversion is always rounded toward zero.  If
1611
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1612
| conversion overflows, the largest integer with the same sign as `a' is
1613
| returned.
1614
*----------------------------------------------------------------------------*/
1615

    
1616
int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1617
{
1618
    flag aSign;
1619
    int_fast16_t aExp, shiftCount;
1620
    uint32_t aSig;
1621
    uint64_t aSig64;
1622
    int64 z;
1623
    a = float32_squash_input_denormal(a STATUS_VAR);
1624

    
1625
    aSig = extractFloat32Frac( a );
1626
    aExp = extractFloat32Exp( a );
1627
    aSign = extractFloat32Sign( a );
1628
    shiftCount = aExp - 0xBE;
1629
    if ( 0 <= shiftCount ) {
1630
        if ( float32_val(a) != 0xDF000000 ) {
1631
            float_raise( float_flag_invalid STATUS_VAR);
1632
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1633
                return LIT64( 0x7FFFFFFFFFFFFFFF );
1634
            }
1635
        }
1636
        return (int64_t) LIT64( 0x8000000000000000 );
1637
    }
1638
    else if ( aExp <= 0x7E ) {
1639
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1640
        return 0;
1641
    }
1642
    aSig64 = aSig | 0x00800000;
1643
    aSig64 <<= 40;
1644
    z = aSig64>>( - shiftCount );
1645
    if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1646
        STATUS(float_exception_flags) |= float_flag_inexact;
1647
    }
1648
    if ( aSign ) z = - z;
1649
    return z;
1650

    
1651
}
1652

    
1653
/*----------------------------------------------------------------------------
1654
| Returns the result of converting the single-precision floating-point value
1655
| `a' to the double-precision floating-point format.  The conversion is
1656
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1657
| Arithmetic.
1658
*----------------------------------------------------------------------------*/
1659

    
1660
float64 float32_to_float64( float32 a STATUS_PARAM )
1661
{
1662
    flag aSign;
1663
    int_fast16_t aExp;
1664
    uint32_t aSig;
1665
    a = float32_squash_input_denormal(a STATUS_VAR);
1666

    
1667
    aSig = extractFloat32Frac( a );
1668
    aExp = extractFloat32Exp( a );
1669
    aSign = extractFloat32Sign( a );
1670
    if ( aExp == 0xFF ) {
1671
        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1672
        return packFloat64( aSign, 0x7FF, 0 );
1673
    }
1674
    if ( aExp == 0 ) {
1675
        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1676
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1677
        --aExp;
1678
    }
1679
    return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1680

    
1681
}
1682

    
1683
/*----------------------------------------------------------------------------
1684
| Returns the result of converting the single-precision floating-point value
1685
| `a' to the extended double-precision floating-point format.  The conversion
1686
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1687
| Arithmetic.
1688
*----------------------------------------------------------------------------*/
1689

    
1690
floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1691
{
1692
    flag aSign;
1693
    int_fast16_t aExp;
1694
    uint32_t aSig;
1695

    
1696
    a = float32_squash_input_denormal(a STATUS_VAR);
1697
    aSig = extractFloat32Frac( a );
1698
    aExp = extractFloat32Exp( a );
1699
    aSign = extractFloat32Sign( a );
1700
    if ( aExp == 0xFF ) {
1701
        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1702
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1703
    }
1704
    if ( aExp == 0 ) {
1705
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1706
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1707
    }
1708
    aSig |= 0x00800000;
1709
    return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1710

    
1711
}
1712

    
1713
/*----------------------------------------------------------------------------
1714
| Returns the result of converting the single-precision floating-point value
1715
| `a' to the double-precision floating-point format.  The conversion is
1716
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1717
| Arithmetic.
1718
*----------------------------------------------------------------------------*/
1719

    
1720
float128 float32_to_float128( float32 a STATUS_PARAM )
1721
{
1722
    flag aSign;
1723
    int_fast16_t aExp;
1724
    uint32_t aSig;
1725

    
1726
    a = float32_squash_input_denormal(a STATUS_VAR);
1727
    aSig = extractFloat32Frac( a );
1728
    aExp = extractFloat32Exp( a );
1729
    aSign = extractFloat32Sign( a );
1730
    if ( aExp == 0xFF ) {
1731
        if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1732
        return packFloat128( aSign, 0x7FFF, 0, 0 );
1733
    }
1734
    if ( aExp == 0 ) {
1735
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1736
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1737
        --aExp;
1738
    }
1739
    return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1740

    
1741
}
1742

    
1743
/*----------------------------------------------------------------------------
1744
| Rounds the single-precision floating-point value `a' to an integer, and
1745
| returns the result as a single-precision floating-point value.  The
1746
| operation is performed according to the IEC/IEEE Standard for Binary
1747
| Floating-Point Arithmetic.
1748
*----------------------------------------------------------------------------*/
1749

    
1750
float32 float32_round_to_int( float32 a STATUS_PARAM)
1751
{
1752
    flag aSign;
1753
    int_fast16_t aExp;
1754
    uint32_t lastBitMask, roundBitsMask;
1755
    int8 roundingMode;
1756
    uint32_t z;
1757
    a = float32_squash_input_denormal(a STATUS_VAR);
1758

    
1759
    aExp = extractFloat32Exp( a );
1760
    if ( 0x96 <= aExp ) {
1761
        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1762
            return propagateFloat32NaN( a, a STATUS_VAR );
1763
        }
1764
        return a;
1765
    }
1766
    if ( aExp <= 0x7E ) {
1767
        if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1768
        STATUS(float_exception_flags) |= float_flag_inexact;
1769
        aSign = extractFloat32Sign( a );
1770
        switch ( STATUS(float_rounding_mode) ) {
1771
         case float_round_nearest_even:
1772
            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1773
                return packFloat32( aSign, 0x7F, 0 );
1774
            }
1775
            break;
1776
         case float_round_down:
1777
            return make_float32(aSign ? 0xBF800000 : 0);
1778
         case float_round_up:
1779
            return make_float32(aSign ? 0x80000000 : 0x3F800000);
1780
        }
1781
        return packFloat32( aSign, 0, 0 );
1782
    }
1783
    lastBitMask = 1;
1784
    lastBitMask <<= 0x96 - aExp;
1785
    roundBitsMask = lastBitMask - 1;
1786
    z = float32_val(a);
1787
    roundingMode = STATUS(float_rounding_mode);
1788
    if ( roundingMode == float_round_nearest_even ) {
1789
        z += lastBitMask>>1;
1790
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1791
    }
1792
    else if ( roundingMode != float_round_to_zero ) {
1793
        if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1794
            z += roundBitsMask;
1795
        }
1796
    }
1797
    z &= ~ roundBitsMask;
1798
    if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1799
    return make_float32(z);
1800

    
1801
}
1802

    
1803
/*----------------------------------------------------------------------------
1804
| Returns the result of adding the absolute values of the single-precision
1805
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1806
| before being returned.  `zSign' is ignored if the result is a NaN.
1807
| The addition is performed according to the IEC/IEEE Standard for Binary
1808
| Floating-Point Arithmetic.
1809
*----------------------------------------------------------------------------*/
1810

    
1811
static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1812
{
1813
    int_fast16_t aExp, bExp, zExp;
1814
    uint32_t aSig, bSig, zSig;
1815
    int_fast16_t expDiff;
1816

    
1817
    aSig = extractFloat32Frac( a );
1818
    aExp = extractFloat32Exp( a );
1819
    bSig = extractFloat32Frac( b );
1820
    bExp = extractFloat32Exp( b );
1821
    expDiff = aExp - bExp;
1822
    aSig <<= 6;
1823
    bSig <<= 6;
1824
    if ( 0 < expDiff ) {
1825
        if ( aExp == 0xFF ) {
1826
            if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1827
            return a;
1828
        }
1829
        if ( bExp == 0 ) {
1830
            --expDiff;
1831
        }
1832
        else {
1833
            bSig |= 0x20000000;
1834
        }
1835
        shift32RightJamming( bSig, expDiff, &bSig );
1836
        zExp = aExp;
1837
    }
1838
    else if ( expDiff < 0 ) {
1839
        if ( bExp == 0xFF ) {
1840
            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1841
            return packFloat32( zSign, 0xFF, 0 );
1842
        }
1843
        if ( aExp == 0 ) {
1844
            ++expDiff;
1845
        }
1846
        else {
1847
            aSig |= 0x20000000;
1848
        }
1849
        shift32RightJamming( aSig, - expDiff, &aSig );
1850
        zExp = bExp;
1851
    }
1852
    else {
1853
        if ( aExp == 0xFF ) {
1854
            if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1855
            return a;
1856
        }
1857
        if ( aExp == 0 ) {
1858
            if (STATUS(flush_to_zero)) {
1859
                if (aSig | bSig) {
1860
                    float_raise(float_flag_output_denormal STATUS_VAR);
1861
                }
1862
                return packFloat32(zSign, 0, 0);
1863
            }
1864
            return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1865
        }
1866
        zSig = 0x40000000 + aSig + bSig;
1867
        zExp = aExp;
1868
        goto roundAndPack;
1869
    }
1870
    aSig |= 0x20000000;
1871
    zSig = ( aSig + bSig )<<1;
1872
    --zExp;
1873
    if ( (int32_t) zSig < 0 ) {
1874
        zSig = aSig + bSig;
1875
        ++zExp;
1876
    }
1877
 roundAndPack:
1878
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1879

    
1880
}
1881

    
1882
/*----------------------------------------------------------------------------
1883
| Returns the result of subtracting the absolute values of the single-
1884
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
1885
| difference is negated before being returned.  `zSign' is ignored if the
1886
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
1887
| Standard for Binary Floating-Point Arithmetic.
1888
*----------------------------------------------------------------------------*/
1889

    
1890
static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1891
{
1892
    int_fast16_t aExp, bExp, zExp;
1893
    uint32_t aSig, bSig, zSig;
1894
    int_fast16_t expDiff;
1895

    
1896
    aSig = extractFloat32Frac( a );
1897
    aExp = extractFloat32Exp( a );
1898
    bSig = extractFloat32Frac( b );
1899
    bExp = extractFloat32Exp( b );
1900
    expDiff = aExp - bExp;
1901
    aSig <<= 7;
1902
    bSig <<= 7;
1903
    if ( 0 < expDiff ) goto aExpBigger;
1904
    if ( expDiff < 0 ) goto bExpBigger;
1905
    if ( aExp == 0xFF ) {
1906
        if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1907
        float_raise( float_flag_invalid STATUS_VAR);
1908
        return float32_default_nan;
1909
    }
1910
    if ( aExp == 0 ) {
1911
        aExp = 1;
1912
        bExp = 1;
1913
    }
1914
    if ( bSig < aSig ) goto aBigger;
1915
    if ( aSig < bSig ) goto bBigger;
1916
    return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1917
 bExpBigger:
1918
    if ( bExp == 0xFF ) {
1919
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1920
        return packFloat32( zSign ^ 1, 0xFF, 0 );
1921
    }
1922
    if ( aExp == 0 ) {
1923
        ++expDiff;
1924
    }
1925
    else {
1926
        aSig |= 0x40000000;
1927
    }
1928
    shift32RightJamming( aSig, - expDiff, &aSig );
1929
    bSig |= 0x40000000;
1930
 bBigger:
1931
    zSig = bSig - aSig;
1932
    zExp = bExp;
1933
    zSign ^= 1;
1934
    goto normalizeRoundAndPack;
1935
 aExpBigger:
1936
    if ( aExp == 0xFF ) {
1937
        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1938
        return a;
1939
    }
1940
    if ( bExp == 0 ) {
1941
        --expDiff;
1942
    }
1943
    else {
1944
        bSig |= 0x40000000;
1945
    }
1946
    shift32RightJamming( bSig, expDiff, &bSig );
1947
    aSig |= 0x40000000;
1948
 aBigger:
1949
    zSig = aSig - bSig;
1950
    zExp = aExp;
1951
 normalizeRoundAndPack:
1952
    --zExp;
1953
    return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1954

    
1955
}
1956

    
1957
/*----------------------------------------------------------------------------
1958
| Returns the result of adding the single-precision floating-point values `a'
1959
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
1960
| Binary Floating-Point Arithmetic.
1961
*----------------------------------------------------------------------------*/
1962

    
1963
float32 float32_add( float32 a, float32 b STATUS_PARAM )
1964
{
1965
    flag aSign, bSign;
1966
    a = float32_squash_input_denormal(a STATUS_VAR);
1967
    b = float32_squash_input_denormal(b STATUS_VAR);
1968

    
1969
    aSign = extractFloat32Sign( a );
1970
    bSign = extractFloat32Sign( b );
1971
    if ( aSign == bSign ) {
1972
        return addFloat32Sigs( a, b, aSign STATUS_VAR);
1973
    }
1974
    else {
1975
        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1976
    }
1977

    
1978
}
1979

    
1980
/*----------------------------------------------------------------------------
1981
| Returns the result of subtracting the single-precision floating-point values
1982
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1983
| for Binary Floating-Point Arithmetic.
1984
*----------------------------------------------------------------------------*/
1985

    
1986
float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1987
{
1988
    flag aSign, bSign;
1989
    a = float32_squash_input_denormal(a STATUS_VAR);
1990
    b = float32_squash_input_denormal(b STATUS_VAR);
1991

    
1992
    aSign = extractFloat32Sign( a );
1993
    bSign = extractFloat32Sign( b );
1994
    if ( aSign == bSign ) {
1995
        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1996
    }
1997
    else {
1998
        return addFloat32Sigs( a, b, aSign STATUS_VAR );
1999
    }
2000

    
2001
}
2002

    
2003
/*----------------------------------------------------------------------------
2004
| Returns the result of multiplying the single-precision floating-point values
2005
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2006
| for Binary Floating-Point Arithmetic.
2007
*----------------------------------------------------------------------------*/
2008

    
2009
float32 float32_mul( float32 a, float32 b STATUS_PARAM )
2010
{
2011
    flag aSign, bSign, zSign;
2012
    int_fast16_t aExp, bExp, zExp;
2013
    uint32_t aSig, bSig;
2014
    uint64_t zSig64;
2015
    uint32_t zSig;
2016

    
2017
    a = float32_squash_input_denormal(a STATUS_VAR);
2018
    b = float32_squash_input_denormal(b STATUS_VAR);
2019

    
2020
    aSig = extractFloat32Frac( a );
2021
    aExp = extractFloat32Exp( a );
2022
    aSign = extractFloat32Sign( a );
2023
    bSig = extractFloat32Frac( b );
2024
    bExp = extractFloat32Exp( b );
2025
    bSign = extractFloat32Sign( b );
2026
    zSign = aSign ^ bSign;
2027
    if ( aExp == 0xFF ) {
2028
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2029
            return propagateFloat32NaN( a, b STATUS_VAR );
2030
        }
2031
        if ( ( bExp | bSig ) == 0 ) {
2032
            float_raise( float_flag_invalid STATUS_VAR);
2033
            return float32_default_nan;
2034
        }
2035
        return packFloat32( zSign, 0xFF, 0 );
2036
    }
2037
    if ( bExp == 0xFF ) {
2038
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2039
        if ( ( aExp | aSig ) == 0 ) {
2040
            float_raise( float_flag_invalid STATUS_VAR);
2041
            return float32_default_nan;
2042
        }
2043
        return packFloat32( zSign, 0xFF, 0 );
2044
    }
2045
    if ( aExp == 0 ) {
2046
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2047
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2048
    }
2049
    if ( bExp == 0 ) {
2050
        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2051
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2052
    }
2053
    zExp = aExp + bExp - 0x7F;
2054
    aSig = ( aSig | 0x00800000 )<<7;
2055
    bSig = ( bSig | 0x00800000 )<<8;
2056
    shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2057
    zSig = zSig64;
2058
    if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2059
        zSig <<= 1;
2060
        --zExp;
2061
    }
2062
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2063

    
2064
}
2065

    
2066
/*----------------------------------------------------------------------------
2067
| Returns the result of dividing the single-precision floating-point value `a'
2068
| by the corresponding value `b'.  The operation is performed according to the
2069
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2070
*----------------------------------------------------------------------------*/
2071

    
2072
float32 float32_div( float32 a, float32 b STATUS_PARAM )
2073
{
2074
    flag aSign, bSign, zSign;
2075
    int_fast16_t aExp, bExp, zExp;
2076
    uint32_t aSig, bSig, zSig;
2077
    a = float32_squash_input_denormal(a STATUS_VAR);
2078
    b = float32_squash_input_denormal(b STATUS_VAR);
2079

    
2080
    aSig = extractFloat32Frac( a );
2081
    aExp = extractFloat32Exp( a );
2082
    aSign = extractFloat32Sign( a );
2083
    bSig = extractFloat32Frac( b );
2084
    bExp = extractFloat32Exp( b );
2085
    bSign = extractFloat32Sign( b );
2086
    zSign = aSign ^ bSign;
2087
    if ( aExp == 0xFF ) {
2088
        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2089
        if ( bExp == 0xFF ) {
2090
            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2091
            float_raise( float_flag_invalid STATUS_VAR);
2092
            return float32_default_nan;
2093
        }
2094
        return packFloat32( zSign, 0xFF, 0 );
2095
    }
2096
    if ( bExp == 0xFF ) {
2097
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2098
        return packFloat32( zSign, 0, 0 );
2099
    }
2100
    if ( bExp == 0 ) {
2101
        if ( bSig == 0 ) {
2102
            if ( ( aExp | aSig ) == 0 ) {
2103
                float_raise( float_flag_invalid STATUS_VAR);
2104
                return float32_default_nan;
2105
            }
2106
            float_raise( float_flag_divbyzero STATUS_VAR);
2107
            return packFloat32( zSign, 0xFF, 0 );
2108
        }
2109
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2110
    }
2111
    if ( aExp == 0 ) {
2112
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2113
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2114
    }
2115
    zExp = aExp - bExp + 0x7D;
2116
    aSig = ( aSig | 0x00800000 )<<7;
2117
    bSig = ( bSig | 0x00800000 )<<8;
2118
    if ( bSig <= ( aSig + aSig ) ) {
2119
        aSig >>= 1;
2120
        ++zExp;
2121
    }
2122
    zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2123
    if ( ( zSig & 0x3F ) == 0 ) {
2124
        zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2125
    }
2126
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2127

    
2128
}
2129

    
2130
/*----------------------------------------------------------------------------
2131
| Returns the remainder of the single-precision floating-point value `a'
2132
| with respect to the corresponding value `b'.  The operation is performed
2133
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2134
*----------------------------------------------------------------------------*/
2135

    
2136
float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2137
{
2138
    flag aSign, zSign;
2139
    int_fast16_t aExp, bExp, expDiff;
2140
    uint32_t aSig, bSig;
2141
    uint32_t q;
2142
    uint64_t aSig64, bSig64, q64;
2143
    uint32_t alternateASig;
2144
    int32_t sigMean;
2145
    a = float32_squash_input_denormal(a STATUS_VAR);
2146
    b = float32_squash_input_denormal(b STATUS_VAR);
2147

    
2148
    aSig = extractFloat32Frac( a );
2149
    aExp = extractFloat32Exp( a );
2150
    aSign = extractFloat32Sign( a );
2151
    bSig = extractFloat32Frac( b );
2152
    bExp = extractFloat32Exp( b );
2153
    if ( aExp == 0xFF ) {
2154
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2155
            return propagateFloat32NaN( a, b STATUS_VAR );
2156
        }
2157
        float_raise( float_flag_invalid STATUS_VAR);
2158
        return float32_default_nan;
2159
    }
2160
    if ( bExp == 0xFF ) {
2161
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2162
        return a;
2163
    }
2164
    if ( bExp == 0 ) {
2165
        if ( bSig == 0 ) {
2166
            float_raise( float_flag_invalid STATUS_VAR);
2167
            return float32_default_nan;
2168
        }
2169
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2170
    }
2171
    if ( aExp == 0 ) {
2172
        if ( aSig == 0 ) return a;
2173
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2174
    }
2175
    expDiff = aExp - bExp;
2176
    aSig |= 0x00800000;
2177
    bSig |= 0x00800000;
2178
    if ( expDiff < 32 ) {
2179
        aSig <<= 8;
2180
        bSig <<= 8;
2181
        if ( expDiff < 0 ) {
2182
            if ( expDiff < -1 ) return a;
2183
            aSig >>= 1;
2184
        }
2185
        q = ( bSig <= aSig );
2186
        if ( q ) aSig -= bSig;
2187
        if ( 0 < expDiff ) {
2188
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2189
            q >>= 32 - expDiff;
2190
            bSig >>= 2;
2191
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2192
        }
2193
        else {
2194
            aSig >>= 2;
2195
            bSig >>= 2;
2196
        }
2197
    }
2198
    else {
2199
        if ( bSig <= aSig ) aSig -= bSig;
2200
        aSig64 = ( (uint64_t) aSig )<<40;
2201
        bSig64 = ( (uint64_t) bSig )<<40;
2202
        expDiff -= 64;
2203
        while ( 0 < expDiff ) {
2204
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2205
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2206
            aSig64 = - ( ( bSig * q64 )<<38 );
2207
            expDiff -= 62;
2208
        }
2209
        expDiff += 64;
2210
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2211
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2212
        q = q64>>( 64 - expDiff );
2213
        bSig <<= 6;
2214
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2215
    }
2216
    do {
2217
        alternateASig = aSig;
2218
        ++q;
2219
        aSig -= bSig;
2220
    } while ( 0 <= (int32_t) aSig );
2221
    sigMean = aSig + alternateASig;
2222
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2223
        aSig = alternateASig;
2224
    }
2225
    zSign = ( (int32_t) aSig < 0 );
2226
    if ( zSign ) aSig = - aSig;
2227
    return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2228

    
2229
}
2230

    
2231
/*----------------------------------------------------------------------------
2232
| Returns the result of multiplying the single-precision floating-point values
2233
| `a' and `b' then adding 'c', with no intermediate rounding step after the
2234
| multiplication.  The operation is performed according to the IEC/IEEE
2235
| Standard for Binary Floating-Point Arithmetic 754-2008.
2236
| The flags argument allows the caller to select negation of the
2237
| addend, the intermediate product, or the final result. (The difference
2238
| between this and having the caller do a separate negation is that negating
2239
| externally will flip the sign bit on NaNs.)
2240
*----------------------------------------------------------------------------*/
2241

    
2242
float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
2243
{
2244
    flag aSign, bSign, cSign, zSign;
2245
    int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
2246
    uint32_t aSig, bSig, cSig;
2247
    flag pInf, pZero, pSign;
2248
    uint64_t pSig64, cSig64, zSig64;
2249
    uint32_t pSig;
2250
    int shiftcount;
2251
    flag signflip, infzero;
2252

    
2253
    a = float32_squash_input_denormal(a STATUS_VAR);
2254
    b = float32_squash_input_denormal(b STATUS_VAR);
2255
    c = float32_squash_input_denormal(c STATUS_VAR);
2256
    aSig = extractFloat32Frac(a);
2257
    aExp = extractFloat32Exp(a);
2258
    aSign = extractFloat32Sign(a);
2259
    bSig = extractFloat32Frac(b);
2260
    bExp = extractFloat32Exp(b);
2261
    bSign = extractFloat32Sign(b);
2262
    cSig = extractFloat32Frac(c);
2263
    cExp = extractFloat32Exp(c);
2264
    cSign = extractFloat32Sign(c);
2265

    
2266
    infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2267
               (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2268

    
2269
    /* It is implementation-defined whether the cases of (0,inf,qnan)
2270
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2271
     * they return if they do), so we have to hand this information
2272
     * off to the target-specific pick-a-NaN routine.
2273
     */
2274
    if (((aExp == 0xff) && aSig) ||
2275
        ((bExp == 0xff) && bSig) ||
2276
        ((cExp == 0xff) && cSig)) {
2277
        return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
2278
    }
2279

    
2280
    if (infzero) {
2281
        float_raise(float_flag_invalid STATUS_VAR);
2282
        return float32_default_nan;
2283
    }
2284

    
2285
    if (flags & float_muladd_negate_c) {
2286
        cSign ^= 1;
2287
    }
2288

    
2289
    signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2290

    
2291
    /* Work out the sign and type of the product */
2292
    pSign = aSign ^ bSign;
2293
    if (flags & float_muladd_negate_product) {
2294
        pSign ^= 1;
2295
    }
2296
    pInf = (aExp == 0xff) || (bExp == 0xff);
2297
    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2298

    
2299
    if (cExp == 0xff) {
2300
        if (pInf && (pSign ^ cSign)) {
2301
            /* addition of opposite-signed infinities => InvalidOperation */
2302
            float_raise(float_flag_invalid STATUS_VAR);
2303
            return float32_default_nan;
2304
        }
2305
        /* Otherwise generate an infinity of the same sign */
2306
        return packFloat32(cSign ^ signflip, 0xff, 0);
2307
    }
2308

    
2309
    if (pInf) {
2310
        return packFloat32(pSign ^ signflip, 0xff, 0);
2311
    }
2312

    
2313
    if (pZero) {
2314
        if (cExp == 0) {
2315
            if (cSig == 0) {
2316
                /* Adding two exact zeroes */
2317
                if (pSign == cSign) {
2318
                    zSign = pSign;
2319
                } else if (STATUS(float_rounding_mode) == float_round_down) {
2320
                    zSign = 1;
2321
                } else {
2322
                    zSign = 0;
2323
                }
2324
                return packFloat32(zSign ^ signflip, 0, 0);
2325
            }
2326
            /* Exact zero plus a denorm */
2327
            if (STATUS(flush_to_zero)) {
2328
                float_raise(float_flag_output_denormal STATUS_VAR);
2329
                return packFloat32(cSign ^ signflip, 0, 0);
2330
            }
2331
        }
2332
        /* Zero plus something non-zero : just return the something */
2333
        return packFloat32(cSign ^ signflip, cExp, cSig);
2334
    }
2335

    
2336
    if (aExp == 0) {
2337
        normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2338
    }
2339
    if (bExp == 0) {
2340
        normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2341
    }
2342

    
2343
    /* Calculate the actual result a * b + c */
2344

    
2345
    /* Multiply first; this is easy. */
2346
    /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2347
     * because we want the true exponent, not the "one-less-than"
2348
     * flavour that roundAndPackFloat32() takes.
2349
     */
2350
    pExp = aExp + bExp - 0x7e;
2351
    aSig = (aSig | 0x00800000) << 7;
2352
    bSig = (bSig | 0x00800000) << 8;
2353
    pSig64 = (uint64_t)aSig * bSig;
2354
    if ((int64_t)(pSig64 << 1) >= 0) {
2355
        pSig64 <<= 1;
2356
        pExp--;
2357
    }
2358

    
2359
    zSign = pSign ^ signflip;
2360

    
2361
    /* Now pSig64 is the significand of the multiply, with the explicit bit in
2362
     * position 62.
2363
     */
2364
    if (cExp == 0) {
2365
        if (!cSig) {
2366
            /* Throw out the special case of c being an exact zero now */
2367
            shift64RightJamming(pSig64, 32, &pSig64);
2368
            pSig = pSig64;
2369
            return roundAndPackFloat32(zSign, pExp - 1,
2370
                                       pSig STATUS_VAR);
2371
        }
2372
        normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2373
    }
2374

    
2375
    cSig64 = (uint64_t)cSig << (62 - 23);
2376
    cSig64 |= LIT64(0x4000000000000000);
2377
    expDiff = pExp - cExp;
2378

    
2379
    if (pSign == cSign) {
2380
        /* Addition */
2381
        if (expDiff > 0) {
2382
            /* scale c to match p */
2383
            shift64RightJamming(cSig64, expDiff, &cSig64);
2384
            zExp = pExp;
2385
        } else if (expDiff < 0) {
2386
            /* scale p to match c */
2387
            shift64RightJamming(pSig64, -expDiff, &pSig64);
2388
            zExp = cExp;
2389
        } else {
2390
            /* no scaling needed */
2391
            zExp = cExp;
2392
        }
2393
        /* Add significands and make sure explicit bit ends up in posn 62 */
2394
        zSig64 = pSig64 + cSig64;
2395
        if ((int64_t)zSig64 < 0) {
2396
            shift64RightJamming(zSig64, 1, &zSig64);
2397
        } else {
2398
            zExp--;
2399
        }
2400
    } else {
2401
        /* Subtraction */
2402
        if (expDiff > 0) {
2403
            shift64RightJamming(cSig64, expDiff, &cSig64);
2404
            zSig64 = pSig64 - cSig64;
2405
            zExp = pExp;
2406
        } else if (expDiff < 0) {
2407
            shift64RightJamming(pSig64, -expDiff, &pSig64);
2408
            zSig64 = cSig64 - pSig64;
2409
            zExp = cExp;
2410
            zSign ^= 1;
2411
        } else {
2412
            zExp = pExp;
2413
            if (cSig64 < pSig64) {
2414
                zSig64 = pSig64 - cSig64;
2415
            } else if (pSig64 < cSig64) {
2416
                zSig64 = cSig64 - pSig64;
2417
                zSign ^= 1;
2418
            } else {
2419
                /* Exact zero */
2420
                zSign = signflip;
2421
                if (STATUS(float_rounding_mode) == float_round_down) {
2422
                    zSign ^= 1;
2423
                }
2424
                return packFloat32(zSign, 0, 0);
2425
            }
2426
        }
2427
        --zExp;
2428
        /* Normalize to put the explicit bit back into bit 62. */
2429
        shiftcount = countLeadingZeros64(zSig64) - 1;
2430
        zSig64 <<= shiftcount;
2431
        zExp -= shiftcount;
2432
    }
2433
    shift64RightJamming(zSig64, 32, &zSig64);
2434
    return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
2435
}
2436

    
2437

    
2438
/*----------------------------------------------------------------------------
2439
| Returns the square root of the single-precision floating-point value `a'.
2440
| The operation is performed according to the IEC/IEEE Standard for Binary
2441
| Floating-Point Arithmetic.
2442
*----------------------------------------------------------------------------*/
2443

    
2444
float32 float32_sqrt( float32 a STATUS_PARAM )
2445
{
2446
    flag aSign;
2447
    int_fast16_t aExp, zExp;
2448
    uint32_t aSig, zSig;
2449
    uint64_t rem, term;
2450
    a = float32_squash_input_denormal(a STATUS_VAR);
2451

    
2452
    aSig = extractFloat32Frac( a );
2453
    aExp = extractFloat32Exp( a );
2454
    aSign = extractFloat32Sign( a );
2455
    if ( aExp == 0xFF ) {
2456
        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2457
        if ( ! aSign ) return a;
2458
        float_raise( float_flag_invalid STATUS_VAR);
2459
        return float32_default_nan;
2460
    }
2461
    if ( aSign ) {
2462
        if ( ( aExp | aSig ) == 0 ) return a;
2463
        float_raise( float_flag_invalid STATUS_VAR);
2464
        return float32_default_nan;
2465
    }
2466
    if ( aExp == 0 ) {
2467
        if ( aSig == 0 ) return float32_zero;
2468
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2469
    }
2470
    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2471
    aSig = ( aSig | 0x00800000 )<<8;
2472
    zSig = estimateSqrt32( aExp, aSig ) + 2;
2473
    if ( ( zSig & 0x7F ) <= 5 ) {
2474
        if ( zSig < 2 ) {
2475
            zSig = 0x7FFFFFFF;
2476
            goto roundAndPack;
2477
        }
2478
        aSig >>= aExp & 1;
2479
        term = ( (uint64_t) zSig ) * zSig;
2480
        rem = ( ( (uint64_t) aSig )<<32 ) - term;
2481
        while ( (int64_t) rem < 0 ) {
2482
            --zSig;
2483
            rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2484
        }
2485
        zSig |= ( rem != 0 );
2486
    }
2487
    shift32RightJamming( zSig, 1, &zSig );
2488
 roundAndPack:
2489
    return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2490

    
2491
}
2492

    
2493
/*----------------------------------------------------------------------------
2494
| Returns the binary exponential of the single-precision floating-point value
2495
| `a'. The operation is performed according to the IEC/IEEE Standard for
2496
| Binary Floating-Point Arithmetic.
2497
|
2498
| Uses the following identities:
2499
|
2500
| 1. -------------------------------------------------------------------------
2501
|      x    x*ln(2)
2502
|     2  = e
2503
|
2504
| 2. -------------------------------------------------------------------------
2505
|                      2     3     4     5           n
2506
|      x        x     x     x     x     x           x
2507
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2508
|               1!    2!    3!    4!    5!          n!
2509
*----------------------------------------------------------------------------*/
2510

    
2511
static const float64 float32_exp2_coefficients[15] =
2512
{
2513
    const_float64( 0x3ff0000000000000ll ), /*  1 */
2514
    const_float64( 0x3fe0000000000000ll ), /*  2 */
2515
    const_float64( 0x3fc5555555555555ll ), /*  3 */
2516
    const_float64( 0x3fa5555555555555ll ), /*  4 */
2517
    const_float64( 0x3f81111111111111ll ), /*  5 */
2518
    const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2519
    const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2520
    const_float64( 0x3efa01a01a01a01all ), /*  8 */
2521
    const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2522
    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2523
    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2524
    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2525
    const_float64( 0x3de6124613a86d09ll ), /* 13 */
2526
    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2527
    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2528
};
2529

    
2530
float32 float32_exp2( float32 a STATUS_PARAM )
2531
{
2532
    flag aSign;
2533
    int_fast16_t aExp;
2534
    uint32_t aSig;
2535
    float64 r, x, xn;
2536
    int i;
2537
    a = float32_squash_input_denormal(a STATUS_VAR);
2538

    
2539
    aSig = extractFloat32Frac( a );
2540
    aExp = extractFloat32Exp( a );
2541
    aSign = extractFloat32Sign( a );
2542

    
2543
    if ( aExp == 0xFF) {
2544
        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2545
        return (aSign) ? float32_zero : a;
2546
    }
2547
    if (aExp == 0) {
2548
        if (aSig == 0) return float32_one;
2549
    }
2550

    
2551
    float_raise( float_flag_inexact STATUS_VAR);
2552

    
2553
    /* ******************************* */
2554
    /* using float64 for approximation */
2555
    /* ******************************* */
2556
    x = float32_to_float64(a STATUS_VAR);
2557
    x = float64_mul(x, float64_ln2 STATUS_VAR);
2558

    
2559
    xn = x;
2560
    r = float64_one;
2561
    for (i = 0 ; i < 15 ; i++) {
2562
        float64 f;
2563

    
2564
        f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2565
        r = float64_add(r, f STATUS_VAR);
2566

    
2567
        xn = float64_mul(xn, x STATUS_VAR);
2568
    }
2569

    
2570
    return float64_to_float32(r, status);
2571
}
2572

    
2573
/*----------------------------------------------------------------------------
2574
| Returns the binary log of the single-precision floating-point value `a'.
2575
| The operation is performed according to the IEC/IEEE Standard for Binary
2576
| Floating-Point Arithmetic.
2577
*----------------------------------------------------------------------------*/
2578
float32 float32_log2( float32 a STATUS_PARAM )
2579
{
2580
    flag aSign, zSign;
2581
    int_fast16_t aExp;
2582
    uint32_t aSig, zSig, i;
2583

    
2584
    a = float32_squash_input_denormal(a STATUS_VAR);
2585
    aSig = extractFloat32Frac( a );
2586
    aExp = extractFloat32Exp( a );
2587
    aSign = extractFloat32Sign( a );
2588

    
2589
    if ( aExp == 0 ) {
2590
        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2591
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2592
    }
2593
    if ( aSign ) {
2594
        float_raise( float_flag_invalid STATUS_VAR);
2595
        return float32_default_nan;
2596
    }
2597
    if ( aExp == 0xFF ) {
2598
        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2599
        return a;
2600
    }
2601

    
2602
    aExp -= 0x7F;
2603
    aSig |= 0x00800000;
2604
    zSign = aExp < 0;
2605
    zSig = aExp << 23;
2606

    
2607
    for (i = 1 << 22; i > 0; i >>= 1) {
2608
        aSig = ( (uint64_t)aSig * aSig ) >> 23;
2609
        if ( aSig & 0x01000000 ) {
2610
            aSig >>= 1;
2611
            zSig |= i;
2612
        }
2613
    }
2614

    
2615
    if ( zSign )
2616
        zSig = -zSig;
2617

    
2618
    return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2619
}
2620

    
2621
/*----------------------------------------------------------------------------
2622
| Returns 1 if the single-precision floating-point value `a' is equal to
2623
| the corresponding value `b', and 0 otherwise.  The invalid exception is
2624
| raised if either operand is a NaN.  Otherwise, the comparison is performed
2625
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2626
*----------------------------------------------------------------------------*/
2627

    
2628
int float32_eq( float32 a, float32 b STATUS_PARAM )
2629
{
2630
    uint32_t av, bv;
2631
    a = float32_squash_input_denormal(a STATUS_VAR);
2632
    b = float32_squash_input_denormal(b STATUS_VAR);
2633

    
2634
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2635
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2636
       ) {
2637
        float_raise( float_flag_invalid STATUS_VAR);
2638
        return 0;
2639
    }
2640
    av = float32_val(a);
2641
    bv = float32_val(b);
2642
    return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2643
}
2644

    
2645
/*----------------------------------------------------------------------------
2646
| Returns 1 if the single-precision floating-point value `a' is less than
2647
| or equal to the corresponding value `b', and 0 otherwise.  The invalid
2648
| exception is raised if either operand is a NaN.  The comparison is performed
2649
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2650
*----------------------------------------------------------------------------*/
2651

    
2652
int float32_le( float32 a, float32 b STATUS_PARAM )
2653
{
2654
    flag aSign, bSign;
2655
    uint32_t av, bv;
2656
    a = float32_squash_input_denormal(a STATUS_VAR);
2657
    b = float32_squash_input_denormal(b STATUS_VAR);
2658

    
2659
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2660
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2661
       ) {
2662
        float_raise( float_flag_invalid STATUS_VAR);
2663
        return 0;
2664
    }
2665
    aSign = extractFloat32Sign( a );
2666
    bSign = extractFloat32Sign( b );
2667
    av = float32_val(a);
2668
    bv = float32_val(b);
2669
    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2670
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2671

    
2672
}
2673

    
2674
/*----------------------------------------------------------------------------
2675
| Returns 1 if the single-precision floating-point value `a' is less than
2676
| the corresponding value `b', and 0 otherwise.  The invalid exception is
2677
| raised if either operand is a NaN.  The comparison is performed according
2678
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2679
*----------------------------------------------------------------------------*/
2680

    
2681
int float32_lt( float32 a, float32 b STATUS_PARAM )
2682
{
2683
    flag aSign, bSign;
2684
    uint32_t av, bv;
2685
    a = float32_squash_input_denormal(a STATUS_VAR);
2686
    b = float32_squash_input_denormal(b STATUS_VAR);
2687

    
2688
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2689
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2690
       ) {
2691
        float_raise( float_flag_invalid STATUS_VAR);
2692
        return 0;
2693
    }
2694
    aSign = extractFloat32Sign( a );
2695
    bSign = extractFloat32Sign( b );
2696
    av = float32_val(a);
2697
    bv = float32_val(b);
2698
    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2699
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2700

    
2701
}
2702

    
2703
/*----------------------------------------------------------------------------
2704
| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2705
| be compared, and 0 otherwise.  The invalid exception is raised if either
2706
| operand is a NaN.  The comparison is performed according to the IEC/IEEE
2707
| Standard for Binary Floating-Point Arithmetic.
2708
*----------------------------------------------------------------------------*/
2709

    
2710
int float32_unordered( float32 a, float32 b STATUS_PARAM )
2711
{
2712
    a = float32_squash_input_denormal(a STATUS_VAR);
2713
    b = float32_squash_input_denormal(b STATUS_VAR);
2714

    
2715
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2716
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2717
       ) {
2718
        float_raise( float_flag_invalid STATUS_VAR);
2719
        return 1;
2720
    }
2721
    return 0;
2722
}
2723

    
2724
/*----------------------------------------------------------------------------
2725
| Returns 1 if the single-precision floating-point value `a' is equal to
2726
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2727
| exception.  The comparison is performed according to the IEC/IEEE Standard
2728
| for Binary Floating-Point Arithmetic.
2729
*----------------------------------------------------------------------------*/
2730

    
2731
int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2732
{
2733
    a = float32_squash_input_denormal(a STATUS_VAR);
2734
    b = float32_squash_input_denormal(b STATUS_VAR);
2735

    
2736
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2737
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2738
       ) {
2739
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2740
            float_raise( float_flag_invalid STATUS_VAR);
2741
        }
2742
        return 0;
2743
    }
2744
    return ( float32_val(a) == float32_val(b) ) ||
2745
            ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2746
}
2747

    
2748
/*----------------------------------------------------------------------------
2749
| Returns 1 if the single-precision floating-point value `a' is less than or
2750
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2751
| cause an exception.  Otherwise, the comparison is performed according to the
2752
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2753
*----------------------------------------------------------------------------*/
2754

    
2755
int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2756
{
2757
    flag aSign, bSign;
2758
    uint32_t av, bv;
2759
    a = float32_squash_input_denormal(a STATUS_VAR);
2760
    b = float32_squash_input_denormal(b STATUS_VAR);
2761

    
2762
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2763
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2764
       ) {
2765
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2766
            float_raise( float_flag_invalid STATUS_VAR);
2767
        }
2768
        return 0;
2769
    }
2770
    aSign = extractFloat32Sign( a );
2771
    bSign = extractFloat32Sign( b );
2772
    av = float32_val(a);
2773
    bv = float32_val(b);
2774
    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2775
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2776

    
2777
}
2778

    
2779
/*----------------------------------------------------------------------------
2780
| Returns 1 if the single-precision floating-point value `a' is less than
2781
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2782
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2783
| Standard for Binary Floating-Point Arithmetic.
2784
*----------------------------------------------------------------------------*/
2785

    
2786
int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2787
{
2788
    flag aSign, bSign;
2789
    uint32_t av, bv;
2790
    a = float32_squash_input_denormal(a STATUS_VAR);
2791
    b = float32_squash_input_denormal(b STATUS_VAR);
2792

    
2793
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2794
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2795
       ) {
2796
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2797
            float_raise( float_flag_invalid STATUS_VAR);
2798
        }
2799
        return 0;
2800
    }
2801
    aSign = extractFloat32Sign( a );
2802
    bSign = extractFloat32Sign( b );
2803
    av = float32_val(a);
2804
    bv = float32_val(b);
2805
    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2806
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2807

    
2808
}
2809

    
2810
/*----------------------------------------------------------------------------
2811
| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2812
| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
2813
| comparison is performed according to the IEC/IEEE Standard for Binary
2814
| Floating-Point Arithmetic.
2815
*----------------------------------------------------------------------------*/
2816

    
2817
int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2818
{
2819
    a = float32_squash_input_denormal(a STATUS_VAR);
2820
    b = float32_squash_input_denormal(b STATUS_VAR);
2821

    
2822
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2823
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2824
       ) {
2825
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2826
            float_raise( float_flag_invalid STATUS_VAR);
2827
        }
2828
        return 1;
2829
    }
2830
    return 0;
2831
}
2832

    
2833
/*----------------------------------------------------------------------------
2834
| Returns the result of converting the double-precision floating-point value
2835
| `a' to the 32-bit two's complement integer format.  The conversion is
2836
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2837
| Arithmetic---which means in particular that the conversion is rounded
2838
| according to the current rounding mode.  If `a' is a NaN, the largest
2839
| positive integer is returned.  Otherwise, if the conversion overflows, the
2840
| largest integer with the same sign as `a' is returned.
2841
*----------------------------------------------------------------------------*/
2842

    
2843
int32 float64_to_int32( float64 a STATUS_PARAM )
2844
{
2845
    flag aSign;
2846
    int_fast16_t aExp, shiftCount;
2847
    uint64_t aSig;
2848
    a = float64_squash_input_denormal(a STATUS_VAR);
2849

    
2850
    aSig = extractFloat64Frac( a );
2851
    aExp = extractFloat64Exp( a );
2852
    aSign = extractFloat64Sign( a );
2853
    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2854
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2855
    shiftCount = 0x42C - aExp;
2856
    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2857
    return roundAndPackInt32( aSign, aSig STATUS_VAR );
2858

    
2859
}
2860

    
2861
/*----------------------------------------------------------------------------
2862
| Returns the result of converting the double-precision floating-point value
2863
| `a' to the 32-bit two's complement integer format.  The conversion is
2864
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2865
| Arithmetic, except that the conversion is always rounded toward zero.
2866
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2867
| the conversion overflows, the largest integer with the same sign as `a' is
2868
| returned.
2869
*----------------------------------------------------------------------------*/
2870

    
2871
int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2872
{
2873
    flag aSign;
2874
    int_fast16_t aExp, shiftCount;
2875
    uint64_t aSig, savedASig;
2876
    int32_t z;
2877
    a = float64_squash_input_denormal(a STATUS_VAR);
2878

    
2879
    aSig = extractFloat64Frac( a );
2880
    aExp = extractFloat64Exp( a );
2881
    aSign = extractFloat64Sign( a );
2882
    if ( 0x41E < aExp ) {
2883
        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2884
        goto invalid;
2885
    }
2886
    else if ( aExp < 0x3FF ) {
2887
        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2888
        return 0;
2889
    }
2890
    aSig |= LIT64( 0x0010000000000000 );
2891
    shiftCount = 0x433 - aExp;
2892
    savedASig = aSig;
2893
    aSig >>= shiftCount;
2894
    z = aSig;
2895
    if ( aSign ) z = - z;
2896
    if ( ( z < 0 ) ^ aSign ) {
2897
 invalid:
2898
        float_raise( float_flag_invalid STATUS_VAR);
2899
        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2900
    }
2901
    if ( ( aSig<<shiftCount ) != savedASig ) {
2902
        STATUS(float_exception_flags) |= float_flag_inexact;
2903
    }
2904
    return z;
2905

    
2906
}
2907

    
2908
/*----------------------------------------------------------------------------
2909
| Returns the result of converting the double-precision floating-point value
2910
| `a' to the 16-bit two's complement integer format.  The conversion is
2911
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2912
| Arithmetic, except that the conversion is always rounded toward zero.
2913
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2914
| the conversion overflows, the largest integer with the same sign as `a' is
2915
| returned.
2916
*----------------------------------------------------------------------------*/
2917

    
2918
int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)
2919
{
2920
    flag aSign;
2921
    int_fast16_t aExp, shiftCount;
2922
    uint64_t aSig, savedASig;
2923
    int32 z;
2924

    
2925
    aSig = extractFloat64Frac( a );
2926
    aExp = extractFloat64Exp( a );
2927
    aSign = extractFloat64Sign( a );
2928
    if ( 0x40E < aExp ) {
2929
        if ( ( aExp == 0x7FF ) && aSig ) {
2930
            aSign = 0;
2931
        }
2932
        goto invalid;
2933
    }
2934
    else if ( aExp < 0x3FF ) {
2935
        if ( aExp || aSig ) {
2936
            STATUS(float_exception_flags) |= float_flag_inexact;
2937
        }
2938
        return 0;
2939
    }
2940
    aSig |= LIT64( 0x0010000000000000 );
2941
    shiftCount = 0x433 - aExp;
2942
    savedASig = aSig;
2943
    aSig >>= shiftCount;
2944
    z = aSig;
2945
    if ( aSign ) {
2946
        z = - z;
2947
    }
2948
    if ( ( (int16_t)z < 0 ) ^ aSign ) {
2949
 invalid:
2950
        float_raise( float_flag_invalid STATUS_VAR);
2951
        return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2952
    }
2953
    if ( ( aSig<<shiftCount ) != savedASig ) {
2954
        STATUS(float_exception_flags) |= float_flag_inexact;
2955
    }
2956
    return z;
2957
}
2958

    
2959
/*----------------------------------------------------------------------------
2960
| Returns the result of converting the double-precision floating-point value
2961
| `a' to the 64-bit two's complement integer format.  The conversion is
2962
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2963
| Arithmetic---which means in particular that the conversion is rounded
2964
| according to the current rounding mode.  If `a' is a NaN, the largest
2965
| positive integer is returned.  Otherwise, if the conversion overflows, the
2966
| largest integer with the same sign as `a' is returned.
2967
*----------------------------------------------------------------------------*/
2968

    
2969
int64 float64_to_int64( float64 a STATUS_PARAM )
2970
{
2971
    flag aSign;
2972
    int_fast16_t aExp, shiftCount;
2973
    uint64_t aSig, aSigExtra;
2974
    a = float64_squash_input_denormal(a STATUS_VAR);
2975

    
2976
    aSig = extractFloat64Frac( a );
2977
    aExp = extractFloat64Exp( a );
2978
    aSign = extractFloat64Sign( a );
2979
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2980
    shiftCount = 0x433 - aExp;
2981
    if ( shiftCount <= 0 ) {
2982
        if ( 0x43E < aExp ) {
2983
            float_raise( float_flag_invalid STATUS_VAR);
2984
            if (    ! aSign
2985
                 || (    ( aExp == 0x7FF )
2986
                      && ( aSig != LIT64( 0x0010000000000000 ) ) )
2987
               ) {
2988
                return LIT64( 0x7FFFFFFFFFFFFFFF );
2989
            }
2990
            return (int64_t) LIT64( 0x8000000000000000 );
2991
        }
2992
        aSigExtra = 0;
2993
        aSig <<= - shiftCount;
2994
    }
2995
    else {
2996
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2997
    }
2998
    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2999

    
3000
}
3001

    
3002
/*----------------------------------------------------------------------------
3003
| Returns the result of converting the double-precision floating-point value
3004
| `a' to the 64-bit two's complement integer format.  The conversion is
3005
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3006
| Arithmetic, except that the conversion is always rounded toward zero.
3007
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3008
| the conversion overflows, the largest integer with the same sign as `a' is
3009
| returned.
3010
*----------------------------------------------------------------------------*/
3011

    
3012
int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
3013
{
3014
    flag aSign;
3015
    int_fast16_t aExp, shiftCount;
3016
    uint64_t aSig;
3017
    int64 z;
3018
    a = float64_squash_input_denormal(a STATUS_VAR);
3019

    
3020
    aSig = extractFloat64Frac( a );
3021
    aExp = extractFloat64Exp( a );
3022
    aSign = extractFloat64Sign( a );
3023
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3024
    shiftCount = aExp - 0x433;
3025
    if ( 0 <= shiftCount ) {
3026
        if ( 0x43E <= aExp ) {
3027
            if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3028
                float_raise( float_flag_invalid STATUS_VAR);
3029
                if (    ! aSign
3030
                     || (    ( aExp == 0x7FF )
3031
                          && ( aSig != LIT64( 0x0010000000000000 ) ) )
3032
                   ) {
3033
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
3034
                }
3035
            }
3036
            return (int64_t) LIT64( 0x8000000000000000 );
3037
        }
3038
        z = aSig<<shiftCount;
3039
    }
3040
    else {
3041
        if ( aExp < 0x3FE ) {
3042
            if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3043
            return 0;
3044
        }
3045
        z = aSig>>( - shiftCount );
3046
        if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3047
            STATUS(float_exception_flags) |= float_flag_inexact;
3048
        }
3049
    }
3050
    if ( aSign ) z = - z;
3051
    return z;
3052

    
3053
}
3054

    
3055
/*----------------------------------------------------------------------------
3056
| Returns the result of converting the double-precision floating-point value
3057
| `a' to the single-precision floating-point format.  The conversion is
3058
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3059
| Arithmetic.
3060
*----------------------------------------------------------------------------*/
3061

    
3062
float32 float64_to_float32( float64 a STATUS_PARAM )
3063
{
3064
    flag aSign;
3065
    int_fast16_t aExp;
3066
    uint64_t aSig;
3067
    uint32_t zSig;
3068
    a = float64_squash_input_denormal(a STATUS_VAR);
3069

    
3070
    aSig = extractFloat64Frac( a );
3071
    aExp = extractFloat64Exp( a );
3072
    aSign = extractFloat64Sign( a );
3073
    if ( aExp == 0x7FF ) {
3074
        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3075
        return packFloat32( aSign, 0xFF, 0 );
3076
    }
3077
    shift64RightJamming( aSig, 22, &aSig );
3078
    zSig = aSig;
3079
    if ( aExp || zSig ) {
3080
        zSig |= 0x40000000;
3081
        aExp -= 0x381;
3082
    }
3083
    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
3084

    
3085
}
3086

    
3087

    
3088
/*----------------------------------------------------------------------------
3089
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3090
| half-precision floating-point value, returning the result.  After being
3091
| shifted into the proper positions, the three fields are simply added
3092
| together to form the result.  This means that any integer portion of `zSig'
3093
| will be added into the exponent.  Since a properly normalized significand
3094
| will have an integer portion equal to 1, the `zExp' input should be 1 less
3095
| than the desired result exponent whenever `zSig' is a complete, normalized
3096
| significand.
3097
*----------------------------------------------------------------------------*/
3098
static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
3099
{
3100
    return make_float16(
3101
        (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3102
}
3103

    
3104
/* Half precision floats come in two formats: standard IEEE and "ARM" format.
3105
   The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3106

    
3107
float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
3108
{
3109
    flag aSign;
3110
    int_fast16_t aExp;
3111
    uint32_t aSig;
3112

    
3113
    aSign = extractFloat16Sign(a);
3114
    aExp = extractFloat16Exp(a);
3115
    aSig = extractFloat16Frac(a);
3116

    
3117
    if (aExp == 0x1f && ieee) {
3118
        if (aSig) {
3119
            return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3120
        }
3121
        return packFloat32(aSign, 0xff, 0);
3122
    }
3123
    if (aExp == 0) {
3124
        int8 shiftCount;
3125

    
3126
        if (aSig == 0) {
3127
            return packFloat32(aSign, 0, 0);
3128
        }
3129

    
3130
        shiftCount = countLeadingZeros32( aSig ) - 21;
3131
        aSig = aSig << shiftCount;
3132
        aExp = -shiftCount;
3133
    }
3134
    return packFloat32( aSign, aExp + 0x70, aSig << 13);
3135
}
3136

    
3137
float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
3138
{
3139
    flag aSign;
3140
    int_fast16_t aExp;
3141
    uint32_t aSig;
3142
    uint32_t mask;
3143
    uint32_t increment;
3144
    int8 roundingMode;
3145
    int maxexp = ieee ? 15 : 16;
3146
    bool rounding_bumps_exp;
3147
    bool is_tiny = false;
3148

    
3149
    a = float32_squash_input_denormal(a STATUS_VAR);
3150

    
3151
    aSig = extractFloat32Frac( a );
3152
    aExp = extractFloat32Exp( a );
3153
    aSign = extractFloat32Sign( a );
3154
    if ( aExp == 0xFF ) {
3155
        if (aSig) {
3156
            /* Input is a NaN */
3157
            if (!ieee) {
3158
                float_raise(float_flag_invalid STATUS_VAR);
3159
                return packFloat16(aSign, 0, 0);
3160
            }
3161
            return commonNaNToFloat16(
3162
                float32ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3163
        }
3164
        /* Infinity */
3165
        if (!ieee) {
3166
            float_raise(float_flag_invalid STATUS_VAR);
3167
            return packFloat16(aSign, 0x1f, 0x3ff);
3168
        }
3169
        return packFloat16(aSign, 0x1f, 0);
3170
    }
3171
    if (aExp == 0 && aSig == 0) {
3172
        return packFloat16(aSign, 0, 0);
3173
    }
3174
    /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3175
     * even if the input is denormal; however this is harmless because
3176
     * the largest possible single-precision denormal is still smaller
3177
     * than the smallest representable half-precision denormal, and so we
3178
     * will end up ignoring aSig and returning via the "always return zero"
3179
     * codepath.
3180
     */
3181
    aSig |= 0x00800000;
3182
    aExp -= 0x7f;
3183
    /* Calculate the mask of bits of the mantissa which are not
3184
     * representable in half-precision and will be lost.
3185
     */
3186
    if (aExp < -14) {
3187
        /* Will be denormal in halfprec */
3188
        mask = 0x00ffffff;
3189
        if (aExp >= -24) {
3190
            mask >>= 25 + aExp;
3191
        }
3192
    } else {
3193
        /* Normal number in halfprec */
3194
        mask = 0x00001fff;
3195
    }
3196

    
3197
    roundingMode = STATUS(float_rounding_mode);
3198
    switch (roundingMode) {
3199
    case float_round_nearest_even:
3200
        increment = (mask + 1) >> 1;
3201
        if ((aSig & mask) == increment) {
3202
            increment = aSig & (increment << 1);
3203
        }
3204
        break;
3205
    case float_round_up:
3206
        increment = aSign ? 0 : mask;
3207
        break;
3208
    case float_round_down:
3209
        increment = aSign ? mask : 0;
3210
        break;
3211
    default: /* round_to_zero */
3212
        increment = 0;
3213
        break;
3214
    }
3215

    
3216
    rounding_bumps_exp = (aSig + increment >= 0x01000000);
3217

    
3218
    if (aExp > maxexp || (aExp == maxexp && rounding_bumps_exp)) {
3219
        if (ieee) {
3220
            float_raise(float_flag_overflow | float_flag_inexact STATUS_VAR);
3221
            return packFloat16(aSign, 0x1f, 0);
3222
        } else {
3223
            float_raise(float_flag_invalid STATUS_VAR);
3224
            return packFloat16(aSign, 0x1f, 0x3ff);
3225
        }
3226
    }
3227

    
3228
    if (aExp < -14) {
3229
        /* Note that flush-to-zero does not affect half-precision results */
3230
        is_tiny =
3231
            (STATUS(float_detect_tininess) == float_tininess_before_rounding)
3232
            || (aExp < -15)
3233
            || (!rounding_bumps_exp);
3234
    }
3235
    if (aSig & mask) {
3236
        float_raise(float_flag_inexact STATUS_VAR);
3237
        if (is_tiny) {
3238
            float_raise(float_flag_underflow STATUS_VAR);
3239
        }
3240
    }
3241

    
3242
    aSig += increment;
3243
    if (rounding_bumps_exp) {
3244
        aSig >>= 1;
3245
        aExp++;
3246
    }
3247

    
3248
    if (aExp < -24) {
3249
        return packFloat16(aSign, 0, 0);
3250
    }
3251
    if (aExp < -14) {
3252
        aSig >>= -14 - aExp;
3253
        aExp = -14;
3254
    }
3255
    return packFloat16(aSign, aExp + 14, aSig >> 13);
3256
}
3257

    
3258
/*----------------------------------------------------------------------------
3259
| Returns the result of converting the double-precision floating-point value
3260
| `a' to the extended double-precision floating-point format.  The conversion
3261
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3262
| Arithmetic.
3263
*----------------------------------------------------------------------------*/
3264

    
3265
floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
3266
{
3267
    flag aSign;
3268
    int_fast16_t aExp;
3269
    uint64_t aSig;
3270

    
3271
    a = float64_squash_input_denormal(a STATUS_VAR);
3272
    aSig = extractFloat64Frac( a );
3273
    aExp = extractFloat64Exp( a );
3274
    aSign = extractFloat64Sign( a );
3275
    if ( aExp == 0x7FF ) {
3276
        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3277
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3278
    }
3279
    if ( aExp == 0 ) {
3280
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3281
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3282
    }
3283
    return
3284
        packFloatx80(
3285
            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3286

    
3287
}
3288

    
3289
/*----------------------------------------------------------------------------
3290
| Returns the result of converting the double-precision floating-point value
3291
| `a' to the quadruple-precision floating-point format.  The conversion is
3292
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3293
| Arithmetic.
3294
*----------------------------------------------------------------------------*/
3295

    
3296
float128 float64_to_float128( float64 a STATUS_PARAM )
3297
{
3298
    flag aSign;
3299
    int_fast16_t aExp;
3300
    uint64_t aSig, zSig0, zSig1;
3301

    
3302
    a = float64_squash_input_denormal(a STATUS_VAR);
3303
    aSig = extractFloat64Frac( a );
3304
    aExp = extractFloat64Exp( a );
3305
    aSign = extractFloat64Sign( a );
3306
    if ( aExp == 0x7FF ) {
3307
        if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3308
        return packFloat128( aSign, 0x7FFF, 0, 0 );
3309
    }
3310
    if ( aExp == 0 ) {
3311
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3312
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3313
        --aExp;
3314
    }
3315
    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3316
    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3317

    
3318
}
3319

    
3320
/*----------------------------------------------------------------------------
3321
| Rounds the double-precision floating-point value `a' to an integer, and
3322
| returns the result as a double-precision floating-point value.  The
3323
| operation is performed according to the IEC/IEEE Standard for Binary
3324
| Floating-Point Arithmetic.
3325
*----------------------------------------------------------------------------*/
3326

    
3327
float64 float64_round_to_int( float64 a STATUS_PARAM )
3328
{
3329
    flag aSign;
3330
    int_fast16_t aExp;
3331
    uint64_t lastBitMask, roundBitsMask;
3332
    int8 roundingMode;
3333
    uint64_t z;
3334
    a = float64_squash_input_denormal(a STATUS_VAR);
3335

    
3336
    aExp = extractFloat64Exp( a );
3337
    if ( 0x433 <= aExp ) {
3338
        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3339
            return propagateFloat64NaN( a, a STATUS_VAR );
3340
        }
3341
        return a;
3342
    }
3343
    if ( aExp < 0x3FF ) {
3344
        if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3345
        STATUS(float_exception_flags) |= float_flag_inexact;
3346
        aSign = extractFloat64Sign( a );
3347
        switch ( STATUS(float_rounding_mode) ) {
3348
         case float_round_nearest_even:
3349
            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3350
                return packFloat64( aSign, 0x3FF, 0 );
3351
            }
3352
            break;
3353
         case float_round_down:
3354
            return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3355
         case float_round_up:
3356
            return make_float64(
3357
            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3358
        }
3359
        return packFloat64( aSign, 0, 0 );
3360
    }
3361
    lastBitMask = 1;
3362
    lastBitMask <<= 0x433 - aExp;
3363
    roundBitsMask = lastBitMask - 1;
3364
    z = float64_val(a);
3365
    roundingMode = STATUS(float_rounding_mode);
3366
    if ( roundingMode == float_round_nearest_even ) {
3367
        z += lastBitMask>>1;
3368
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3369
    }
3370
    else if ( roundingMode != float_round_to_zero ) {
3371
        if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3372
            z += roundBitsMask;
3373
        }
3374
    }
3375
    z &= ~ roundBitsMask;
3376
    if ( z != float64_val(a) )
3377
        STATUS(float_exception_flags) |= float_flag_inexact;
3378
    return make_float64(z);
3379

    
3380
}
3381

    
3382
float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3383
{
3384
    int oldmode;
3385
    float64 res;
3386
    oldmode = STATUS(float_rounding_mode);
3387
    STATUS(float_rounding_mode) = float_round_to_zero;
3388
    res = float64_round_to_int(a STATUS_VAR);
3389
    STATUS(float_rounding_mode) = oldmode;
3390
    return res;
3391
}
3392

    
3393
/*----------------------------------------------------------------------------
3394
| Returns the result of adding the absolute values of the double-precision
3395
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3396
| before being returned.  `zSign' is ignored if the result is a NaN.
3397
| The addition is performed according to the IEC/IEEE Standard for Binary
3398
| Floating-Point Arithmetic.
3399
*----------------------------------------------------------------------------*/
3400

    
3401
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3402
{
3403
    int_fast16_t aExp, bExp, zExp;
3404
    uint64_t aSig, bSig, zSig;
3405
    int_fast16_t expDiff;
3406

    
3407
    aSig = extractFloat64Frac( a );
3408
    aExp = extractFloat64Exp( a );
3409
    bSig = extractFloat64Frac( b );
3410
    bExp = extractFloat64Exp( b );
3411
    expDiff = aExp - bExp;
3412
    aSig <<= 9;
3413
    bSig <<= 9;
3414
    if ( 0 < expDiff ) {
3415
        if ( aExp == 0x7FF ) {
3416
            if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3417
            return a;
3418
        }
3419
        if ( bExp == 0 ) {
3420
            --expDiff;
3421
        }
3422
        else {
3423
            bSig |= LIT64( 0x2000000000000000 );
3424
        }
3425
        shift64RightJamming( bSig, expDiff, &bSig );
3426
        zExp = aExp;
3427
    }
3428
    else if ( expDiff < 0 ) {
3429
        if ( bExp == 0x7FF ) {
3430
            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3431
            return packFloat64( zSign, 0x7FF, 0 );
3432
        }
3433
        if ( aExp == 0 ) {
3434
            ++expDiff;
3435
        }
3436
        else {
3437
            aSig |= LIT64( 0x2000000000000000 );
3438
        }
3439
        shift64RightJamming( aSig, - expDiff, &aSig );
3440
        zExp = bExp;
3441
    }
3442
    else {
3443
        if ( aExp == 0x7FF ) {
3444
            if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3445
            return a;
3446
        }
3447
        if ( aExp == 0 ) {
3448
            if (STATUS(flush_to_zero)) {
3449
                if (aSig | bSig) {
3450
                    float_raise(float_flag_output_denormal STATUS_VAR);
3451
                }
3452
                return packFloat64(zSign, 0, 0);
3453
            }
3454
            return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3455
        }
3456
        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3457
        zExp = aExp;
3458
        goto roundAndPack;
3459
    }
3460
    aSig |= LIT64( 0x2000000000000000 );
3461
    zSig = ( aSig + bSig )<<1;
3462
    --zExp;
3463
    if ( (int64_t) zSig < 0 ) {
3464
        zSig = aSig + bSig;
3465
        ++zExp;
3466
    }
3467
 roundAndPack:
3468
    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3469

    
3470
}
3471

    
3472
/*----------------------------------------------------------------------------
3473
| Returns the result of subtracting the absolute values of the double-
3474
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
3475
| difference is negated before being returned.  `zSign' is ignored if the
3476
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
3477
| Standard for Binary Floating-Point Arithmetic.
3478
*----------------------------------------------------------------------------*/
3479

    
3480
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3481
{
3482
    int_fast16_t aExp, bExp, zExp;
3483
    uint64_t aSig, bSig, zSig;
3484
    int_fast16_t expDiff;
3485

    
3486
    aSig = extractFloat64Frac( a );
3487
    aExp = extractFloat64Exp( a );
3488
    bSig = extractFloat64Frac( b );
3489
    bExp = extractFloat64Exp( b );
3490
    expDiff = aExp - bExp;
3491
    aSig <<= 10;
3492
    bSig <<= 10;
3493
    if ( 0 < expDiff ) goto aExpBigger;
3494
    if ( expDiff < 0 ) goto bExpBigger;
3495
    if ( aExp == 0x7FF ) {
3496
        if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3497
        float_raise( float_flag_invalid STATUS_VAR);
3498
        return float64_default_nan;
3499
    }
3500
    if ( aExp == 0 ) {
3501
        aExp = 1;
3502
        bExp = 1;
3503
    }
3504
    if ( bSig < aSig ) goto aBigger;
3505
    if ( aSig < bSig ) goto bBigger;
3506
    return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3507
 bExpBigger:
3508
    if ( bExp == 0x7FF ) {
3509
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3510
        return packFloat64( zSign ^ 1, 0x7FF, 0 );
3511
    }
3512
    if ( aExp == 0 ) {
3513
        ++expDiff;
3514
    }
3515
    else {
3516
        aSig |= LIT64( 0x4000000000000000 );
3517
    }
3518
    shift64RightJamming( aSig, - expDiff, &aSig );
3519
    bSig |= LIT64( 0x4000000000000000 );
3520
 bBigger:
3521
    zSig = bSig - aSig;
3522
    zExp = bExp;
3523
    zSign ^= 1;
3524
    goto normalizeRoundAndPack;
3525
 aExpBigger:
3526
    if ( aExp == 0x7FF ) {
3527
        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3528
        return a;
3529
    }
3530
    if ( bExp == 0 ) {
3531
        --expDiff;
3532
    }
3533
    else {
3534
        bSig |= LIT64( 0x4000000000000000 );
3535
    }
3536
    shift64RightJamming( bSig, expDiff, &bSig );
3537
    aSig |= LIT64( 0x4000000000000000 );
3538
 aBigger:
3539
    zSig = aSig - bSig;
3540
    zExp = aExp;
3541
 normalizeRoundAndPack:
3542
    --zExp;
3543
    return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3544

    
3545
}
3546

    
3547
/*----------------------------------------------------------------------------
3548
| Returns the result of adding the double-precision floating-point values `a'
3549
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
3550
| Binary Floating-Point Arithmetic.
3551
*----------------------------------------------------------------------------*/
3552

    
3553
float64 float64_add( float64 a, float64 b STATUS_PARAM )
3554
{
3555
    flag aSign, bSign;
3556
    a = float64_squash_input_denormal(a STATUS_VAR);
3557
    b = float64_squash_input_denormal(b STATUS_VAR);
3558

    
3559
    aSign = extractFloat64Sign( a );
3560
    bSign = extractFloat64Sign( b );
3561
    if ( aSign == bSign ) {
3562
        return addFloat64Sigs( a, b, aSign STATUS_VAR );
3563
    }
3564
    else {
3565
        return subFloat64Sigs( a, b, aSign STATUS_VAR );
3566
    }
3567

    
3568
}
3569

    
3570
/*----------------------------------------------------------------------------
3571
| Returns the result of subtracting the double-precision floating-point values
3572
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3573
| for Binary Floating-Point Arithmetic.
3574
*----------------------------------------------------------------------------*/
3575

    
3576
float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3577
{
3578
    flag aSign, bSign;
3579
    a = float64_squash_input_denormal(a STATUS_VAR);
3580
    b = float64_squash_input_denormal(b STATUS_VAR);
3581

    
3582
    aSign = extractFloat64Sign( a );
3583
    bSign = extractFloat64Sign( b );
3584
    if ( aSign == bSign ) {
3585
        return subFloat64Sigs( a, b, aSign STATUS_VAR );
3586
    }
3587
    else {
3588
        return addFloat64Sigs( a, b, aSign STATUS_VAR );
3589
    }
3590

    
3591
}
3592

    
3593
/*----------------------------------------------------------------------------
3594
| Returns the result of multiplying the double-precision floating-point values
3595
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3596
| for Binary Floating-Point Arithmetic.
3597
*----------------------------------------------------------------------------*/
3598

    
3599
float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3600
{
3601
    flag aSign, bSign, zSign;
3602
    int_fast16_t aExp, bExp, zExp;
3603
    uint64_t aSig, bSig, zSig0, zSig1;
3604

    
3605
    a = float64_squash_input_denormal(a STATUS_VAR);
3606
    b = float64_squash_input_denormal(b STATUS_VAR);
3607

    
3608
    aSig = extractFloat64Frac( a );
3609
    aExp = extractFloat64Exp( a );
3610
    aSign = extractFloat64Sign( a );
3611
    bSig = extractFloat64Frac( b );
3612
    bExp = extractFloat64Exp( b );
3613
    bSign = extractFloat64Sign( b );
3614
    zSign = aSign ^ bSign;
3615
    if ( aExp == 0x7FF ) {
3616
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3617
            return propagateFloat64NaN( a, b STATUS_VAR );
3618
        }
3619
        if ( ( bExp | bSig ) == 0 ) {
3620
            float_raise( float_flag_invalid STATUS_VAR);
3621
            return float64_default_nan;
3622
        }
3623
        return packFloat64( zSign, 0x7FF, 0 );
3624
    }
3625
    if ( bExp == 0x7FF ) {
3626
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3627
        if ( ( aExp | aSig ) == 0 ) {
3628
            float_raise( float_flag_invalid STATUS_VAR);
3629
            return float64_default_nan;
3630
        }
3631
        return packFloat64( zSign, 0x7FF, 0 );
3632
    }
3633
    if ( aExp == 0 ) {
3634
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3635
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3636
    }
3637
    if ( bExp == 0 ) {
3638
        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3639
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3640
    }
3641
    zExp = aExp + bExp - 0x3FF;
3642
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3643
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3644
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
3645
    zSig0 |= ( zSig1 != 0 );
3646
    if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3647
        zSig0 <<= 1;
3648
        --zExp;
3649
    }
3650
    return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3651

    
3652
}
3653

    
3654
/*----------------------------------------------------------------------------
3655
| Returns the result of dividing the double-precision floating-point value `a'
3656
| by the corresponding value `b'.  The operation is performed according to
3657
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3658
*----------------------------------------------------------------------------*/
3659

    
3660
float64 float64_div( float64 a, float64 b STATUS_PARAM )
3661
{
3662
    flag aSign, bSign, zSign;
3663
    int_fast16_t aExp, bExp, zExp;
3664
    uint64_t aSig, bSig, zSig;
3665
    uint64_t rem0, rem1;
3666
    uint64_t term0, term1;
3667
    a = float64_squash_input_denormal(a STATUS_VAR);
3668
    b = float64_squash_input_denormal(b STATUS_VAR);
3669

    
3670
    aSig = extractFloat64Frac( a );
3671
    aExp = extractFloat64Exp( a );
3672
    aSign = extractFloat64Sign( a );
3673
    bSig = extractFloat64Frac( b );
3674
    bExp = extractFloat64Exp( b );
3675
    bSign = extractFloat64Sign( b );
3676
    zSign = aSign ^ bSign;
3677
    if ( aExp == 0x7FF ) {
3678
        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3679
        if ( bExp == 0x7FF ) {
3680
            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3681
            float_raise( float_flag_invalid STATUS_VAR);
3682
            return float64_default_nan;
3683
        }
3684
        return packFloat64( zSign, 0x7FF, 0 );
3685
    }
3686
    if ( bExp == 0x7FF ) {
3687
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3688
        return packFloat64( zSign, 0, 0 );
3689
    }
3690
    if ( bExp == 0 ) {
3691
        if ( bSig == 0 ) {
3692
            if ( ( aExp | aSig ) == 0 ) {
3693
                float_raise( float_flag_invalid STATUS_VAR);
3694
                return float64_default_nan;
3695
            }
3696
            float_raise( float_flag_divbyzero STATUS_VAR);
3697
            return packFloat64( zSign, 0x7FF, 0 );
3698
        }
3699
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3700
    }
3701
    if ( aExp == 0 ) {
3702
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3703
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3704
    }
3705
    zExp = aExp - bExp + 0x3FD;
3706
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3707
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3708
    if ( bSig <= ( aSig + aSig ) ) {
3709
        aSig >>= 1;
3710
        ++zExp;
3711
    }
3712
    zSig = estimateDiv128To64( aSig, 0, bSig );
3713
    if ( ( zSig & 0x1FF ) <= 2 ) {
3714
        mul64To128( bSig, zSig, &term0, &term1 );
3715
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3716
        while ( (int64_t) rem0 < 0 ) {
3717
            --zSig;
3718
            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3719
        }
3720
        zSig |= ( rem1 != 0 );
3721
    }
3722
    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3723

    
3724
}
3725

    
3726
/*----------------------------------------------------------------------------
3727
| Returns the remainder of the double-precision floating-point value `a'
3728
| with respect to the corresponding value `b'.  The operation is performed
3729
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3730
*----------------------------------------------------------------------------*/
3731

    
3732
float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3733
{
3734
    flag aSign, zSign;
3735
    int_fast16_t aExp, bExp, expDiff;
3736
    uint64_t aSig, bSig;
3737
    uint64_t q, alternateASig;
3738
    int64_t sigMean;
3739

    
3740
    a = float64_squash_input_denormal(a STATUS_VAR);
3741
    b = float64_squash_input_denormal(b STATUS_VAR);
3742
    aSig = extractFloat64Frac( a );
3743
    aExp = extractFloat64Exp( a );
3744
    aSign = extractFloat64Sign( a );
3745
    bSig = extractFloat64Frac( b );
3746
    bExp = extractFloat64Exp( b );
3747
    if ( aExp == 0x7FF ) {
3748
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3749
            return propagateFloat64NaN( a, b STATUS_VAR );
3750
        }
3751
        float_raise( float_flag_invalid STATUS_VAR);
3752
        return float64_default_nan;
3753
    }
3754
    if ( bExp == 0x7FF ) {
3755
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3756
        return a;
3757
    }
3758
    if ( bExp == 0 ) {
3759
        if ( bSig == 0 ) {
3760
            float_raise( float_flag_invalid STATUS_VAR);
3761
            return float64_default_nan;
3762
        }
3763
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3764
    }
3765
    if ( aExp == 0 ) {
3766
        if ( aSig == 0 ) return a;
3767
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3768
    }
3769
    expDiff = aExp - bExp;
3770
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3771
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3772
    if ( expDiff < 0 ) {
3773
        if ( expDiff < -1 ) return a;
3774
        aSig >>= 1;
3775
    }
3776
    q = ( bSig <= aSig );
3777
    if ( q ) aSig -= bSig;
3778
    expDiff -= 64;
3779
    while ( 0 < expDiff ) {
3780
        q = estimateDiv128To64( aSig, 0, bSig );
3781
        q = ( 2 < q ) ? q - 2 : 0;
3782
        aSig = - ( ( bSig>>2 ) * q );
3783
        expDiff -= 62;
3784
    }
3785
    expDiff += 64;
3786
    if ( 0 < expDiff ) {
3787
        q = estimateDiv128To64( aSig, 0, bSig );
3788
        q = ( 2 < q ) ? q - 2 : 0;
3789
        q >>= 64 - expDiff;
3790
        bSig >>= 2;
3791
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3792
    }
3793
    else {
3794
        aSig >>= 2;
3795
        bSig >>= 2;
3796
    }
3797
    do {
3798
        alternateASig = aSig;
3799
        ++q;
3800
        aSig -= bSig;
3801
    } while ( 0 <= (int64_t) aSig );
3802
    sigMean = aSig + alternateASig;
3803
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3804
        aSig = alternateASig;
3805
    }
3806
    zSign = ( (int64_t) aSig < 0 );
3807
    if ( zSign ) aSig = - aSig;
3808
    return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3809

    
3810
}
3811

    
3812
/*----------------------------------------------------------------------------
3813
| Returns the result of multiplying the double-precision floating-point values
3814
| `a' and `b' then adding 'c', with no intermediate rounding step after the
3815
| multiplication.  The operation is performed according to the IEC/IEEE
3816
| Standard for Binary Floating-Point Arithmetic 754-2008.
3817
| The flags argument allows the caller to select negation of the
3818
| addend, the intermediate product, or the final result. (The difference
3819
| between this and having the caller do a separate negation is that negating
3820
| externally will flip the sign bit on NaNs.)
3821
*----------------------------------------------------------------------------*/
3822

    
3823
float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
3824
{
3825
    flag aSign, bSign, cSign, zSign;
3826
    int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
3827
    uint64_t aSig, bSig, cSig;
3828
    flag pInf, pZero, pSign;
3829
    uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
3830
    int shiftcount;
3831
    flag signflip, infzero;
3832

    
3833
    a = float64_squash_input_denormal(a STATUS_VAR);
3834
    b = float64_squash_input_denormal(b STATUS_VAR);
3835
    c = float64_squash_input_denormal(c STATUS_VAR);
3836
    aSig = extractFloat64Frac(a);
3837
    aExp = extractFloat64Exp(a);
3838
    aSign = extractFloat64Sign(a);
3839
    bSig = extractFloat64Frac(b);
3840
    bExp = extractFloat64Exp(b);
3841
    bSign = extractFloat64Sign(b);
3842
    cSig = extractFloat64Frac(c);
3843
    cExp = extractFloat64Exp(c);
3844
    cSign = extractFloat64Sign(c);
3845

    
3846
    infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
3847
               (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
3848

    
3849
    /* It is implementation-defined whether the cases of (0,inf,qnan)
3850
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
3851
     * they return if they do), so we have to hand this information
3852
     * off to the target-specific pick-a-NaN routine.
3853
     */
3854
    if (((aExp == 0x7ff) && aSig) ||
3855
        ((bExp == 0x7ff) && bSig) ||
3856
        ((cExp == 0x7ff) && cSig)) {
3857
        return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
3858
    }
3859

    
3860
    if (infzero) {
3861
        float_raise(float_flag_invalid STATUS_VAR);
3862
        return float64_default_nan;
3863
    }
3864

    
3865
    if (flags & float_muladd_negate_c) {
3866
        cSign ^= 1;
3867
    }
3868

    
3869
    signflip = (flags & float_muladd_negate_result) ? 1 : 0;
3870

    
3871
    /* Work out the sign and type of the product */
3872
    pSign = aSign ^ bSign;
3873
    if (flags & float_muladd_negate_product) {
3874
        pSign ^= 1;
3875
    }
3876
    pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
3877
    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
3878

    
3879
    if (cExp == 0x7ff) {
3880
        if (pInf && (pSign ^ cSign)) {
3881
            /* addition of opposite-signed infinities => InvalidOperation */
3882
            float_raise(float_flag_invalid STATUS_VAR);
3883
            return float64_default_nan;
3884
        }
3885
        /* Otherwise generate an infinity of the same sign */
3886
        return packFloat64(cSign ^ signflip, 0x7ff, 0);
3887
    }
3888

    
3889
    if (pInf) {
3890
        return packFloat64(pSign ^ signflip, 0x7ff, 0);
3891
    }
3892

    
3893
    if (pZero) {
3894
        if (cExp == 0) {
3895
            if (cSig == 0) {
3896
                /* Adding two exact zeroes */
3897
                if (pSign == cSign) {
3898
                    zSign = pSign;
3899
                } else if (STATUS(float_rounding_mode) == float_round_down) {
3900
                    zSign = 1;
3901
                } else {
3902
                    zSign = 0;
3903
                }
3904
                return packFloat64(zSign ^ signflip, 0, 0);
3905
            }
3906
            /* Exact zero plus a denorm */
3907
            if (STATUS(flush_to_zero)) {
3908
                float_raise(float_flag_output_denormal STATUS_VAR);
3909
                return packFloat64(cSign ^ signflip, 0, 0);
3910
            }
3911
        }
3912
        /* Zero plus something non-zero : just return the something */
3913
        return packFloat64(cSign ^ signflip, cExp, cSig);
3914
    }
3915

    
3916
    if (aExp == 0) {
3917
        normalizeFloat64Subnormal(aSig, &aExp, &aSig);
3918
    }
3919
    if (bExp == 0) {
3920
        normalizeFloat64Subnormal(bSig, &bExp, &bSig);
3921
    }
3922

    
3923
    /* Calculate the actual result a * b + c */
3924

    
3925
    /* Multiply first; this is easy. */
3926
    /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
3927
     * because we want the true exponent, not the "one-less-than"
3928
     * flavour that roundAndPackFloat64() takes.
3929
     */
3930
    pExp = aExp + bExp - 0x3fe;
3931
    aSig = (aSig | LIT64(0x0010000000000000))<<10;
3932
    bSig = (bSig | LIT64(0x0010000000000000))<<11;
3933
    mul64To128(aSig, bSig, &pSig0, &pSig1);
3934
    if ((int64_t)(pSig0 << 1) >= 0) {
3935
        shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
3936
        pExp--;
3937
    }
3938

    
3939
    zSign = pSign ^ signflip;
3940

    
3941
    /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
3942
     * bit in position 126.
3943
     */
3944
    if (cExp == 0) {
3945
        if (!cSig) {
3946
            /* Throw out the special case of c being an exact zero now */
3947
            shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
3948
            return roundAndPackFloat64(zSign, pExp - 1,
3949
                                       pSig1 STATUS_VAR);
3950
        }
3951
        normalizeFloat64Subnormal(cSig, &cExp, &cSig);
3952
    }
3953

    
3954
    /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
3955
     * significand of the addend, with the explicit bit in position 126.
3956
     */
3957
    cSig0 = cSig << (126 - 64 - 52);
3958
    cSig1 = 0;
3959
    cSig0 |= LIT64(0x4000000000000000);
3960
    expDiff = pExp - cExp;
3961

    
3962
    if (pSign == cSign) {
3963
        /* Addition */
3964
        if (expDiff > 0) {
3965
            /* scale c to match p */
3966
            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
3967
            zExp = pExp;
3968
        } else if (expDiff < 0) {
3969
            /* scale p to match c */
3970
            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
3971
            zExp = cExp;
3972
        } else {
3973
            /* no scaling needed */
3974
            zExp = cExp;
3975
        }
3976
        /* Add significands and make sure explicit bit ends up in posn 126 */
3977
        add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3978
        if ((int64_t)zSig0 < 0) {
3979
            shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
3980
        } else {
3981
            zExp--;
3982
        }
3983
        shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
3984
        return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
3985
    } else {
3986
        /* Subtraction */
3987
        if (expDiff > 0) {
3988
            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
3989
            sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3990
            zExp = pExp;
3991
        } else if (expDiff < 0) {
3992
            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
3993
            sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
3994
            zExp = cExp;
3995
            zSign ^= 1;
3996
        } else {
3997
            zExp = pExp;
3998
            if (lt128(cSig0, cSig1, pSig0, pSig1)) {
3999
                sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4000
            } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4001
                sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4002
                zSign ^= 1;
4003
            } else {
4004
                /* Exact zero */
4005
                zSign = signflip;
4006
                if (STATUS(float_rounding_mode) == float_round_down) {
4007
                    zSign ^= 1;
4008
                }
4009
                return packFloat64(zSign, 0, 0);
4010
            }
4011
        }
4012
        --zExp;
4013
        /* Do the equivalent of normalizeRoundAndPackFloat64() but
4014
         * starting with the significand in a pair of uint64_t.
4015
         */
4016
        if (zSig0) {
4017
            shiftcount = countLeadingZeros64(zSig0) - 1;
4018
            shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4019
            if (zSig1) {
4020
                zSig0 |= 1;
4021
            }
4022
            zExp -= shiftcount;
4023
        } else {
4024
            shiftcount = countLeadingZeros64(zSig1);
4025
            if (shiftcount == 0) {
4026
                zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4027
                zExp -= 63;
4028
            } else {
4029
                shiftcount--;
4030
                zSig0 = zSig1 << shiftcount;
4031
                zExp -= (shiftcount + 64);
4032
            }
4033
        }
4034
        return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
4035
    }
4036
}
4037

    
4038
/*----------------------------------------------------------------------------
4039
| Returns the square root of the double-precision floating-point value `a'.
4040
| The operation is performed according to the IEC/IEEE Standard for Binary
4041
| Floating-Point Arithmetic.
4042
*----------------------------------------------------------------------------*/
4043

    
4044
float64 float64_sqrt( float64 a STATUS_PARAM )
4045
{
4046
    flag aSign;
4047
    int_fast16_t aExp, zExp;
4048
    uint64_t aSig, zSig, doubleZSig;
4049
    uint64_t rem0, rem1, term0, term1;
4050
    a = float64_squash_input_denormal(a STATUS_VAR);
4051

    
4052
    aSig = extractFloat64Frac( a );
4053
    aExp = extractFloat64Exp( a );
4054
    aSign = extractFloat64Sign( a );
4055
    if ( aExp == 0x7FF ) {
4056
        if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
4057
        if ( ! aSign ) return a;
4058
        float_raise( float_flag_invalid STATUS_VAR);
4059
        return float64_default_nan;
4060
    }
4061
    if ( aSign ) {
4062
        if ( ( aExp | aSig ) == 0 ) return a;
4063
        float_raise( float_flag_invalid STATUS_VAR);
4064
        return float64_default_nan;
4065
    }
4066
    if ( aExp == 0 ) {
4067
        if ( aSig == 0 ) return float64_zero;
4068
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4069
    }
4070
    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4071
    aSig |= LIT64( 0x0010000000000000 );
4072
    zSig = estimateSqrt32( aExp, aSig>>21 );
4073
    aSig <<= 9 - ( aExp & 1 );
4074
    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4075
    if ( ( zSig & 0x1FF ) <= 5 ) {
4076
        doubleZSig = zSig<<1;
4077
        mul64To128( zSig, zSig, &term0, &term1 );
4078
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4079
        while ( (int64_t) rem0 < 0 ) {
4080
            --zSig;
4081
            doubleZSig -= 2;
4082
            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4083
        }
4084
        zSig |= ( ( rem0 | rem1 ) != 0 );
4085
    }
4086
    return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
4087

    
4088
}
4089

    
4090
/*----------------------------------------------------------------------------
4091
| Returns the binary log of the double-precision floating-point value `a'.
4092
| The operation is performed according to the IEC/IEEE Standard for Binary
4093
| Floating-Point Arithmetic.
4094
*----------------------------------------------------------------------------*/
4095
float64 float64_log2( float64 a STATUS_PARAM )
4096
{
4097
    flag aSign, zSign;
4098
    int_fast16_t aExp;
4099
    uint64_t aSig, aSig0, aSig1, zSig, i;
4100
    a = float64_squash_input_denormal(a STATUS_VAR);
4101

    
4102
    aSig = extractFloat64Frac( a );
4103
    aExp = extractFloat64Exp( a );
4104
    aSign = extractFloat64Sign( a );
4105

    
4106
    if ( aExp == 0 ) {
4107
        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4108
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4109
    }
4110
    if ( aSign ) {
4111
        float_raise( float_flag_invalid STATUS_VAR);
4112
        return float64_default_nan;
4113
    }
4114
    if ( aExp == 0x7FF ) {
4115
        if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
4116
        return a;
4117
    }
4118

    
4119
    aExp -= 0x3FF;
4120
    aSig |= LIT64( 0x0010000000000000 );
4121
    zSign = aExp < 0;
4122
    zSig = (uint64_t)aExp << 52;
4123
    for (i = 1LL << 51; i > 0; i >>= 1) {
4124
        mul64To128( aSig, aSig, &aSig0, &aSig1 );
4125
        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4126
        if ( aSig & LIT64( 0x0020000000000000 ) ) {
4127
            aSig >>= 1;
4128
            zSig |= i;
4129
        }
4130
    }
4131

    
4132
    if ( zSign )
4133
        zSig = -zSig;
4134
    return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
4135
}
4136

    
4137
/*----------------------------------------------------------------------------
4138
| Returns 1 if the double-precision floating-point value `a' is equal to the
4139
| corresponding value `b', and 0 otherwise.  The invalid exception is raised
4140
| if either operand is a NaN.  Otherwise, the comparison is performed
4141
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4142
*----------------------------------------------------------------------------*/
4143

    
4144
int float64_eq( float64 a, float64 b STATUS_PARAM )
4145
{
4146
    uint64_t av, bv;
4147
    a = float64_squash_input_denormal(a STATUS_VAR);
4148
    b = float64_squash_input_denormal(b STATUS_VAR);
4149

    
4150
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4151
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4152
       ) {
4153
        float_raise( float_flag_invalid STATUS_VAR);
4154
        return 0;
4155
    }
4156
    av = float64_val(a);
4157
    bv = float64_val(b);
4158
    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4159

    
4160
}
4161

    
4162
/*----------------------------------------------------------------------------
4163
| Returns 1 if the double-precision floating-point value `a' is less than or
4164
| equal to the corresponding value `b', and 0 otherwise.  The invalid
4165
| exception is raised if either operand is a NaN.  The comparison is performed
4166
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4167
*----------------------------------------------------------------------------*/
4168

    
4169
int float64_le( float64 a, float64 b STATUS_PARAM )
4170
{
4171
    flag aSign, bSign;
4172
    uint64_t av, bv;
4173
    a = float64_squash_input_denormal(a STATUS_VAR);
4174
    b = float64_squash_input_denormal(b STATUS_VAR);
4175

    
4176
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4177
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4178
       ) {
4179
        float_raise( float_flag_invalid STATUS_VAR);
4180
        return 0;
4181
    }
4182
    aSign = extractFloat64Sign( a );
4183
    bSign = extractFloat64Sign( b );
4184
    av = float64_val(a);
4185
    bv = float64_val(b);
4186
    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4187
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
4188

    
4189
}
4190

    
4191
/*----------------------------------------------------------------------------
4192
| Returns 1 if the double-precision floating-point value `a' is less than
4193
| the corresponding value `b', and 0 otherwise.  The invalid exception is
4194
| raised if either operand is a NaN.  The comparison is performed according
4195
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4196
*----------------------------------------------------------------------------*/
4197

    
4198
int float64_lt( float64 a, float64 b STATUS_PARAM )
4199
{
4200
    flag aSign, bSign;
4201
    uint64_t av, bv;
4202

    
4203
    a = float64_squash_input_denormal(a STATUS_VAR);
4204
    b = float64_squash_input_denormal(b STATUS_VAR);
4205
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4206
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4207
       ) {
4208
        float_raise( float_flag_invalid STATUS_VAR);
4209
        return 0;
4210
    }
4211
    aSign = extractFloat64Sign( a );
4212
    bSign = extractFloat64Sign( b );
4213
    av = float64_val(a);
4214
    bv = float64_val(b);
4215
    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4216
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
4217

    
4218
}
4219

    
4220
/*----------------------------------------------------------------------------
4221
| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4222
| be compared, and 0 otherwise.  The invalid exception is raised if either
4223
| operand is a NaN.  The comparison is performed according to the IEC/IEEE
4224
| Standard for Binary Floating-Point Arithmetic.
4225
*----------------------------------------------------------------------------*/
4226

    
4227
int float64_unordered( float64 a, float64 b STATUS_PARAM )
4228
{
4229
    a = float64_squash_input_denormal(a STATUS_VAR);
4230
    b = float64_squash_input_denormal(b STATUS_VAR);
4231

    
4232
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4233
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4234
       ) {
4235
        float_raise( float_flag_invalid STATUS_VAR);
4236
        return 1;
4237
    }
4238
    return 0;
4239
}
4240

    
4241
/*----------------------------------------------------------------------------
4242
| Returns 1 if the double-precision floating-point value `a' is equal to the
4243
| corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4244
| exception.The comparison is performed according to the IEC/IEEE Standard
4245
| for Binary Floating-Point Arithmetic.
4246
*----------------------------------------------------------------------------*/
4247

    
4248
int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
4249
{
4250
    uint64_t av, bv;
4251
    a = float64_squash_input_denormal(a STATUS_VAR);
4252
    b = float64_squash_input_denormal(b STATUS_VAR);
4253

    
4254
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4255
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4256
       ) {
4257
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4258
            float_raise( float_flag_invalid STATUS_VAR);
4259
        }
4260
        return 0;
4261
    }
4262
    av = float64_val(a);
4263
    bv = float64_val(b);
4264
    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4265

    
4266
}
4267

    
4268
/*----------------------------------------------------------------------------
4269
| Returns 1 if the double-precision floating-point value `a' is less than or
4270
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4271
| cause an exception.  Otherwise, the comparison is performed according to the
4272
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4273
*----------------------------------------------------------------------------*/
4274

    
4275
int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
4276
{
4277
    flag aSign, bSign;
4278
    uint64_t av, bv;
4279
    a = float64_squash_input_denormal(a STATUS_VAR);
4280
    b = float64_squash_input_denormal(b STATUS_VAR);
4281

    
4282
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4283
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4284
       ) {
4285
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4286
            float_raise( float_flag_invalid STATUS_VAR);
4287
        }
4288
        return 0;
4289
    }
4290
    aSign = extractFloat64Sign( a );
4291
    bSign = extractFloat64Sign( b );
4292
    av = float64_val(a);
4293
    bv = float64_val(b);
4294
    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4295
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
4296

    
4297
}
4298

    
4299
/*----------------------------------------------------------------------------
4300
| Returns 1 if the double-precision floating-point value `a' is less than
4301
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4302
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4303
| Standard for Binary Floating-Point Arithmetic.
4304
*----------------------------------------------------------------------------*/
4305

    
4306
int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
4307
{
4308
    flag aSign, bSign;
4309
    uint64_t av, bv;
4310
    a = float64_squash_input_denormal(a STATUS_VAR);
4311
    b = float64_squash_input_denormal(b STATUS_VAR);
4312

    
4313
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4314
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4315
       ) {
4316
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4317
            float_raise( float_flag_invalid STATUS_VAR);
4318
        }
4319
        return 0;
4320
    }
4321
    aSign = extractFloat64Sign( a );
4322
    bSign = extractFloat64Sign( b );
4323
    av = float64_val(a);
4324
    bv = float64_val(b);
4325
    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4326
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
4327

    
4328
}
4329

    
4330
/*----------------------------------------------------------------------------
4331
| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4332
| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4333
| comparison is performed according to the IEC/IEEE Standard for Binary
4334
| Floating-Point Arithmetic.
4335
*----------------------------------------------------------------------------*/
4336

    
4337
int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
4338
{
4339
    a = float64_squash_input_denormal(a STATUS_VAR);
4340
    b = float64_squash_input_denormal(b STATUS_VAR);
4341

    
4342
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4343
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4344
       ) {
4345
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4346
            float_raise( float_flag_invalid STATUS_VAR);
4347
        }
4348
        return 1;
4349
    }
4350
    return 0;
4351
}
4352

    
4353
/*----------------------------------------------------------------------------
4354
| Returns the result of converting the extended double-precision floating-
4355
| point value `a' to the 32-bit two's complement integer format.  The
4356
| conversion is performed according to the IEC/IEEE Standard for Binary
4357
| Floating-Point Arithmetic---which means in particular that the conversion
4358
| is rounded according to the current rounding mode.  If `a' is a NaN, the
4359
| largest positive integer is returned.  Otherwise, if the conversion
4360
| overflows, the largest integer with the same sign as `a' is returned.
4361
*----------------------------------------------------------------------------*/
4362

    
4363
int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
4364
{
4365
    flag aSign;
4366
    int32 aExp, shiftCount;
4367
    uint64_t aSig;
4368

    
4369
    aSig = extractFloatx80Frac( a );
4370
    aExp = extractFloatx80Exp( a );
4371
    aSign = extractFloatx80Sign( a );
4372
    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4373
    shiftCount = 0x4037 - aExp;
4374
    if ( shiftCount <= 0 ) shiftCount = 1;
4375
    shift64RightJamming( aSig, shiftCount, &aSig );
4376
    return roundAndPackInt32( aSign, aSig STATUS_VAR );
4377

    
4378
}
4379

    
4380
/*----------------------------------------------------------------------------
4381
| Returns the result of converting the extended double-precision floating-
4382
| point value `a' to the 32-bit two's complement integer format.  The
4383
| conversion is performed according to the IEC/IEEE Standard for Binary
4384
| Floating-Point Arithmetic, except that the conversion is always rounded
4385
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
4386
| Otherwise, if the conversion overflows, the largest integer with the same
4387
| sign as `a' is returned.
4388
*----------------------------------------------------------------------------*/
4389

    
4390
int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
4391
{
4392
    flag aSign;
4393
    int32 aExp, shiftCount;
4394
    uint64_t aSig, savedASig;
4395
    int32_t z;
4396

    
4397
    aSig = extractFloatx80Frac( a );
4398
    aExp = extractFloatx80Exp( a );
4399
    aSign = extractFloatx80Sign( a );
4400
    if ( 0x401E < aExp ) {
4401
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4402
        goto invalid;
4403
    }
4404
    else if ( aExp < 0x3FFF ) {
4405
        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4406
        return 0;
4407
    }
4408
    shiftCount = 0x403E - aExp;
4409
    savedASig = aSig;
4410
    aSig >>= shiftCount;
4411
    z = aSig;
4412
    if ( aSign ) z = - z;
4413
    if ( ( z < 0 ) ^ aSign ) {
4414
 invalid:
4415
        float_raise( float_flag_invalid STATUS_VAR);
4416
        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4417
    }
4418
    if ( ( aSig<<shiftCount ) != savedASig ) {
4419
        STATUS(float_exception_flags) |= float_flag_inexact;
4420
    }
4421
    return z;
4422

    
4423
}
4424

    
4425
/*----------------------------------------------------------------------------
4426
| Returns the result of converting the extended double-precision floating-
4427
| point value `a' to the 64-bit two's complement integer format.  The
4428
| conversion is performed according to the IEC/IEEE Standard for Binary
4429
| Floating-Point Arithmetic---which means in particular that the conversion
4430
| is rounded according to the current rounding mode.  If `a' is a NaN,
4431
| the largest positive integer is returned.  Otherwise, if the conversion
4432
| overflows, the largest integer with the same sign as `a' is returned.
4433
*----------------------------------------------------------------------------*/
4434

    
4435
int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
4436
{
4437
    flag aSign;
4438
    int32 aExp, shiftCount;
4439
    uint64_t aSig, aSigExtra;
4440

    
4441
    aSig = extractFloatx80Frac( a );
4442
    aExp = extractFloatx80Exp( a );
4443
    aSign = extractFloatx80Sign( a );
4444
    shiftCount = 0x403E - aExp;
4445
    if ( shiftCount <= 0 ) {
4446
        if ( shiftCount ) {
4447
            float_raise( float_flag_invalid STATUS_VAR);
4448
            if (    ! aSign
4449
                 || (    ( aExp == 0x7FFF )
4450
                      && ( aSig != LIT64( 0x8000000000000000 ) ) )
4451
               ) {
4452
                return LIT64( 0x7FFFFFFFFFFFFFFF );
4453
            }
4454
            return (int64_t) LIT64( 0x8000000000000000 );
4455
        }
4456
        aSigExtra = 0;
4457
    }
4458
    else {
4459
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4460
    }
4461
    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
4462

    
4463
}
4464

    
4465
/*----------------------------------------------------------------------------
4466
| Returns the result of converting the extended double-precision floating-
4467
| point value `a' to the 64-bit two's complement integer format.  The
4468
| conversion is performed according to the IEC/IEEE Standard for Binary
4469
| Floating-Point Arithmetic, except that the conversion is always rounded
4470
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
4471
| Otherwise, if the conversion overflows, the largest integer with the same
4472
| sign as `a' is returned.
4473
*----------------------------------------------------------------------------*/
4474

    
4475
int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
4476
{
4477
    flag aSign;
4478
    int32 aExp, shiftCount;
4479
    uint64_t aSig;
4480
    int64 z;
4481

    
4482
    aSig = extractFloatx80Frac( a );
4483
    aExp = extractFloatx80Exp( a );
4484
    aSign = extractFloatx80Sign( a );
4485
    shiftCount = aExp - 0x403E;
4486
    if ( 0 <= shiftCount ) {
4487
        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4488
        if ( ( a.high != 0xC03E ) || aSig ) {
4489
            float_raise( float_flag_invalid STATUS_VAR);
4490
            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4491
                return LIT64( 0x7FFFFFFFFFFFFFFF );
4492
            }
4493
        }
4494
        return (int64_t) LIT64( 0x8000000000000000 );
4495
    }
4496
    else if ( aExp < 0x3FFF ) {
4497
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4498
        return 0;
4499
    }
4500
    z = aSig>>( - shiftCount );
4501
    if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4502
        STATUS(float_exception_flags) |= float_flag_inexact;
4503
    }
4504
    if ( aSign ) z = - z;
4505
    return z;
4506

    
4507
}
4508

    
4509
/*----------------------------------------------------------------------------
4510
| Returns the result of converting the extended double-precision floating-
4511
| point value `a' to the single-precision floating-point format.  The
4512
| conversion is performed according to the IEC/IEEE Standard for Binary
4513
| Floating-Point Arithmetic.
4514
*----------------------------------------------------------------------------*/
4515

    
4516
float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
4517
{
4518
    flag aSign;
4519
    int32 aExp;
4520
    uint64_t aSig;
4521

    
4522
    aSig = extractFloatx80Frac( a );
4523
    aExp = extractFloatx80Exp( a );
4524
    aSign = extractFloatx80Sign( a );
4525
    if ( aExp == 0x7FFF ) {
4526
        if ( (uint64_t) ( aSig<<1 ) ) {
4527
            return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4528
        }
4529
        return packFloat32( aSign, 0xFF, 0 );
4530
    }
4531
    shift64RightJamming( aSig, 33, &aSig );
4532
    if ( aExp || aSig ) aExp -= 0x3F81;
4533
    return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
4534

    
4535
}
4536

    
4537
/*----------------------------------------------------------------------------
4538
| Returns the result of converting the extended double-precision floating-
4539
| point value `a' to the double-precision floating-point format.  The
4540
| conversion is performed according to the IEC/IEEE Standard for Binary
4541
| Floating-Point Arithmetic.
4542
*----------------------------------------------------------------------------*/
4543

    
4544
float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
4545
{
4546
    flag aSign;
4547
    int32 aExp;
4548
    uint64_t aSig, zSig;
4549

    
4550
    aSig = extractFloatx80Frac( a );
4551
    aExp = extractFloatx80Exp( a );
4552
    aSign = extractFloatx80Sign( a );
4553
    if ( aExp == 0x7FFF ) {
4554
        if ( (uint64_t) ( aSig<<1 ) ) {
4555
            return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4556
        }
4557
        return packFloat64( aSign, 0x7FF, 0 );
4558
    }
4559
    shift64RightJamming( aSig, 1, &zSig );
4560
    if ( aExp || aSig ) aExp -= 0x3C01;
4561
    return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4562

    
4563
}
4564

    
4565
/*----------------------------------------------------------------------------
4566
| Returns the result of converting the extended double-precision floating-
4567
| point value `a' to the quadruple-precision floating-point format.  The
4568
| conversion is performed according to the IEC/IEEE Standard for Binary
4569
| Floating-Point Arithmetic.
4570
*----------------------------------------------------------------------------*/
4571

    
4572
float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4573
{
4574
    flag aSign;
4575
    int_fast16_t aExp;
4576
    uint64_t aSig, zSig0, zSig1;
4577

    
4578
    aSig = extractFloatx80Frac( a );
4579
    aExp = extractFloatx80Exp( a );
4580
    aSign = extractFloatx80Sign( a );
4581
    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4582
        return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4583
    }
4584
    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4585
    return packFloat128( aSign, aExp, zSig0, zSig1 );
4586

    
4587
}
4588

    
4589
/*----------------------------------------------------------------------------
4590
| Rounds the extended double-precision floating-point value `a' to an integer,
4591
| and returns the result as an extended quadruple-precision floating-point
4592
| value.  The operation is performed according to the IEC/IEEE Standard for
4593
| Binary Floating-Point Arithmetic.
4594
*----------------------------------------------------------------------------*/
4595

    
4596
floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4597
{
4598
    flag aSign;
4599
    int32 aExp;
4600
    uint64_t lastBitMask, roundBitsMask;
4601
    int8 roundingMode;
4602
    floatx80 z;
4603

    
4604
    aExp = extractFloatx80Exp( a );
4605
    if ( 0x403E <= aExp ) {
4606
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4607
            return propagateFloatx80NaN( a, a STATUS_VAR );
4608
        }
4609
        return a;
4610
    }
4611
    if ( aExp < 0x3FFF ) {
4612
        if (    ( aExp == 0 )
4613
             && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4614
            return a;
4615
        }
4616
        STATUS(float_exception_flags) |= float_flag_inexact;
4617
        aSign = extractFloatx80Sign( a );
4618
        switch ( STATUS(float_rounding_mode) ) {
4619
         case float_round_nearest_even:
4620
            if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4621
               ) {
4622
                return
4623
                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4624
            }
4625
            break;
4626
         case float_round_down:
4627
            return
4628
                  aSign ?
4629
                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4630
                : packFloatx80( 0, 0, 0 );
4631
         case float_round_up:
4632
            return
4633
                  aSign ? packFloatx80( 1, 0, 0 )
4634
                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4635
        }
4636
        return packFloatx80( aSign, 0, 0 );
4637
    }
4638
    lastBitMask = 1;
4639
    lastBitMask <<= 0x403E - aExp;
4640
    roundBitsMask = lastBitMask - 1;
4641
    z = a;
4642
    roundingMode = STATUS(float_rounding_mode);
4643
    if ( roundingMode == float_round_nearest_even ) {
4644
        z.low += lastBitMask>>1;
4645
        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4646
    }
4647
    else if ( roundingMode != float_round_to_zero ) {
4648
        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4649
            z.low += roundBitsMask;
4650
        }
4651
    }
4652
    z.low &= ~ roundBitsMask;
4653
    if ( z.low == 0 ) {
4654
        ++z.high;
4655
        z.low = LIT64( 0x8000000000000000 );
4656
    }
4657
    if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4658
    return z;
4659

    
4660
}
4661

    
4662
/*----------------------------------------------------------------------------
4663
| Returns the result of adding the absolute values of the extended double-
4664
| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4665
| negated before being returned.  `zSign' is ignored if the result is a NaN.
4666
| The addition is performed according to the IEC/IEEE Standard for Binary
4667
| Floating-Point Arithmetic.
4668
*----------------------------------------------------------------------------*/
4669

    
4670
static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4671
{
4672
    int32 aExp, bExp, zExp;
4673
    uint64_t aSig, bSig, zSig0, zSig1;
4674
    int32 expDiff;
4675

    
4676
    aSig = extractFloatx80Frac( a );
4677
    aExp = extractFloatx80Exp( a );
4678
    bSig = extractFloatx80Frac( b );
4679
    bExp = extractFloatx80Exp( b );
4680
    expDiff = aExp - bExp;
4681
    if ( 0 < expDiff ) {
4682
        if ( aExp == 0x7FFF ) {
4683
            if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4684
            return a;
4685
        }
4686
        if ( bExp == 0 ) --expDiff;
4687
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4688
        zExp = aExp;
4689
    }
4690
    else if ( expDiff < 0 ) {
4691
        if ( bExp == 0x7FFF ) {
4692
            if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4693
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4694
        }
4695
        if ( aExp == 0 ) ++expDiff;
4696
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4697
        zExp = bExp;
4698
    }
4699
    else {
4700
        if ( aExp == 0x7FFF ) {
4701
            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4702
                return propagateFloatx80NaN( a, b STATUS_VAR );
4703
            }
4704
            return a;
4705
        }
4706
        zSig1 = 0;
4707
        zSig0 = aSig + bSig;
4708
        if ( aExp == 0 ) {
4709
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4710
            goto roundAndPack;
4711
        }
4712
        zExp = aExp;
4713
        goto shiftRight1;
4714
    }
4715
    zSig0 = aSig + bSig;
4716
    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4717
 shiftRight1:
4718
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4719
    zSig0 |= LIT64( 0x8000000000000000 );
4720
    ++zExp;
4721
 roundAndPack:
4722
    return
4723
        roundAndPackFloatx80(
4724
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4725

    
4726
}
4727

    
4728
/*----------------------------------------------------------------------------
4729
| Returns the result of subtracting the absolute values of the extended
4730
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4731
| difference is negated before being returned.  `zSign' is ignored if the
4732
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
4733
| Standard for Binary Floating-Point Arithmetic.
4734
*----------------------------------------------------------------------------*/
4735

    
4736
static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4737
{
4738
    int32 aExp, bExp, zExp;
4739
    uint64_t aSig, bSig, zSig0, zSig1;
4740
    int32 expDiff;
4741
    floatx80 z;
4742

    
4743
    aSig = extractFloatx80Frac( a );
4744
    aExp = extractFloatx80Exp( a );
4745
    bSig = extractFloatx80Frac( b );
4746
    bExp = extractFloatx80Exp( b );
4747
    expDiff = aExp - bExp;
4748
    if ( 0 < expDiff ) goto aExpBigger;
4749
    if ( expDiff < 0 ) goto bExpBigger;
4750
    if ( aExp == 0x7FFF ) {
4751
        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4752
            return propagateFloatx80NaN( a, b STATUS_VAR );
4753
        }
4754
        float_raise( float_flag_invalid STATUS_VAR);
4755
        z.low = floatx80_default_nan_low;
4756
        z.high = floatx80_default_nan_high;
4757
        return z;
4758
    }
4759
    if ( aExp == 0 ) {
4760
        aExp = 1;
4761
        bExp = 1;
4762
    }
4763
    zSig1 = 0;
4764
    if ( bSig < aSig ) goto aBigger;
4765
    if ( aSig < bSig ) goto bBigger;
4766
    return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4767
 bExpBigger:
4768
    if ( bExp == 0x7FFF ) {
4769
        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4770
        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4771
    }
4772
    if ( aExp == 0 ) ++expDiff;
4773
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4774
 bBigger:
4775
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4776
    zExp = bExp;
4777
    zSign ^= 1;
4778
    goto normalizeRoundAndPack;
4779
 aExpBigger:
4780
    if ( aExp == 0x7FFF ) {
4781
        if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4782
        return a;
4783
    }
4784
    if ( bExp == 0 ) --expDiff;
4785
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4786
 aBigger:
4787
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4788
    zExp = aExp;
4789
 normalizeRoundAndPack:
4790
    return
4791
        normalizeRoundAndPackFloatx80(
4792
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4793

    
4794
}
4795

    
4796
/*----------------------------------------------------------------------------
4797
| Returns the result of adding the extended double-precision floating-point
4798
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4799
| Standard for Binary Floating-Point Arithmetic.
4800
*----------------------------------------------------------------------------*/
4801

    
4802
floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4803
{
4804
    flag aSign, bSign;
4805

    
4806
    aSign = extractFloatx80Sign( a );
4807
    bSign = extractFloatx80Sign( b );
4808
    if ( aSign == bSign ) {
4809
        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4810
    }
4811
    else {
4812
        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4813
    }
4814

    
4815
}
4816

    
4817
/*----------------------------------------------------------------------------
4818
| Returns the result of subtracting the extended double-precision floating-
4819
| point values `a' and `b'.  The operation is performed according to the
4820
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4821
*----------------------------------------------------------------------------*/
4822

    
4823
floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4824
{
4825
    flag aSign, bSign;
4826

    
4827
    aSign = extractFloatx80Sign( a );
4828
    bSign = extractFloatx80Sign( b );
4829
    if ( aSign == bSign ) {
4830
        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4831
    }
4832
    else {
4833
        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4834
    }
4835

    
4836
}
4837

    
4838
/*----------------------------------------------------------------------------
4839
| Returns the result of multiplying the extended double-precision floating-
4840
| point values `a' and `b'.  The operation is performed according to the
4841
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4842
*----------------------------------------------------------------------------*/
4843

    
4844
floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4845
{
4846
    flag aSign, bSign, zSign;
4847
    int32 aExp, bExp, zExp;
4848
    uint64_t aSig, bSig, zSig0, zSig1;
4849
    floatx80 z;
4850

    
4851
    aSig = extractFloatx80Frac( a );
4852
    aExp = extractFloatx80Exp( a );
4853
    aSign = extractFloatx80Sign( a );
4854
    bSig = extractFloatx80Frac( b );
4855
    bExp = extractFloatx80Exp( b );
4856
    bSign = extractFloatx80Sign( b );
4857
    zSign = aSign ^ bSign;
4858
    if ( aExp == 0x7FFF ) {
4859
        if (    (uint64_t) ( aSig<<1 )
4860
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4861
            return propagateFloatx80NaN( a, b STATUS_VAR );
4862
        }
4863
        if ( ( bExp | bSig ) == 0 ) goto invalid;
4864
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4865
    }
4866
    if ( bExp == 0x7FFF ) {
4867
        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4868
        if ( ( aExp | aSig ) == 0 ) {
4869
 invalid:
4870
            float_raise( float_flag_invalid STATUS_VAR);
4871
            z.low = floatx80_default_nan_low;
4872
            z.high = floatx80_default_nan_high;
4873
            return z;
4874
        }
4875
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4876
    }
4877
    if ( aExp == 0 ) {
4878
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4879
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4880
    }
4881
    if ( bExp == 0 ) {
4882
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4883
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4884
    }
4885
    zExp = aExp + bExp - 0x3FFE;
4886
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
4887
    if ( 0 < (int64_t) zSig0 ) {
4888
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4889
        --zExp;
4890
    }
4891
    return
4892
        roundAndPackFloatx80(
4893
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4894

    
4895
}
4896

    
4897
/*----------------------------------------------------------------------------
4898
| Returns the result of dividing the extended double-precision floating-point
4899
| value `a' by the corresponding value `b'.  The operation is performed
4900
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4901
*----------------------------------------------------------------------------*/
4902

    
4903
floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4904
{
4905
    flag aSign, bSign, zSign;
4906
    int32 aExp, bExp, zExp;
4907
    uint64_t aSig, bSig, zSig0, zSig1;
4908
    uint64_t rem0, rem1, rem2, term0, term1, term2;
4909
    floatx80 z;
4910

    
4911
    aSig = extractFloatx80Frac( a );
4912
    aExp = extractFloatx80Exp( a );
4913
    aSign = extractFloatx80Sign( a );
4914
    bSig = extractFloatx80Frac( b );
4915
    bExp = extractFloatx80Exp( b );
4916
    bSign = extractFloatx80Sign( b );
4917
    zSign = aSign ^ bSign;
4918
    if ( aExp == 0x7FFF ) {
4919
        if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4920
        if ( bExp == 0x7FFF ) {
4921
            if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4922
            goto invalid;
4923
        }
4924
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4925
    }
4926
    if ( bExp == 0x7FFF ) {
4927
        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4928
        return packFloatx80( zSign, 0, 0 );
4929
    }
4930
    if ( bExp == 0 ) {
4931
        if ( bSig == 0 ) {
4932
            if ( ( aExp | aSig ) == 0 ) {
4933
 invalid:
4934
                float_raise( float_flag_invalid STATUS_VAR);
4935
                z.low = floatx80_default_nan_low;
4936
                z.high = floatx80_default_nan_high;
4937
                return z;
4938
            }
4939
            float_raise( float_flag_divbyzero STATUS_VAR);
4940
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4941
        }
4942
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4943
    }
4944
    if ( aExp == 0 ) {
4945
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4946
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4947
    }
4948
    zExp = aExp - bExp + 0x3FFE;
4949
    rem1 = 0;
4950
    if ( bSig <= aSig ) {
4951
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
4952
        ++zExp;
4953
    }
4954
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4955
    mul64To128( bSig, zSig0, &term0, &term1 );
4956
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4957
    while ( (int64_t) rem0 < 0 ) {
4958
        --zSig0;
4959
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4960
    }
4961
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
4962
    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4963
        mul64To128( bSig, zSig1, &term1, &term2 );
4964
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4965
        while ( (int64_t) rem1 < 0 ) {
4966
            --zSig1;
4967
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4968
        }
4969
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
4970
    }
4971
    return
4972
        roundAndPackFloatx80(
4973
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4974

    
4975
}
4976

    
4977
/*----------------------------------------------------------------------------
4978
| Returns the remainder of the extended double-precision floating-point value
4979
| `a' with respect to the corresponding value `b'.  The operation is performed
4980
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4981
*----------------------------------------------------------------------------*/
4982

    
4983
floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4984
{
4985
    flag aSign, zSign;
4986
    int32 aExp, bExp, expDiff;
4987
    uint64_t aSig0, aSig1, bSig;
4988
    uint64_t q, term0, term1, alternateASig0, alternateASig1;
4989
    floatx80 z;
4990

    
4991
    aSig0 = extractFloatx80Frac( a );
4992
    aExp = extractFloatx80Exp( a );
4993
    aSign = extractFloatx80Sign( a );
4994
    bSig = extractFloatx80Frac( b );
4995
    bExp = extractFloatx80Exp( b );
4996
    if ( aExp == 0x7FFF ) {
4997
        if (    (uint64_t) ( aSig0<<1 )
4998
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4999
            return propagateFloatx80NaN( a, b STATUS_VAR );
5000
        }
5001
        goto invalid;
5002
    }
5003
    if ( bExp == 0x7FFF ) {
5004
        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5005
        return a;
5006
    }
5007
    if ( bExp == 0 ) {
5008
        if ( bSig == 0 ) {
5009
 invalid:
5010
            float_raise( float_flag_invalid STATUS_VAR);
5011
            z.low = floatx80_default_nan_low;
5012
            z.high = floatx80_default_nan_high;
5013
            return z;
5014
        }
5015
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5016
    }
5017
    if ( aExp == 0 ) {
5018
        if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5019
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5020
    }
5021
    bSig |= LIT64( 0x8000000000000000 );
5022
    zSign = aSign;
5023
    expDiff = aExp - bExp;
5024
    aSig1 = 0;
5025
    if ( expDiff < 0 ) {
5026
        if ( expDiff < -1 ) return a;
5027
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5028
        expDiff = 0;
5029
    }
5030
    q = ( bSig <= aSig0 );
5031
    if ( q ) aSig0 -= bSig;
5032
    expDiff -= 64;
5033
    while ( 0 < expDiff ) {
5034
        q = estimateDiv128To64( aSig0, aSig1, bSig );
5035
        q = ( 2 < q ) ? q - 2 : 0;
5036
        mul64To128( bSig, q, &term0, &term1 );
5037
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5038
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5039
        expDiff -= 62;
5040
    }
5041
    expDiff += 64;
5042
    if ( 0 < expDiff ) {
5043
        q = estimateDiv128To64( aSig0, aSig1, bSig );
5044
        q = ( 2 < q ) ? q - 2 : 0;
5045
        q >>= 64 - expDiff;
5046
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5047
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5048
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5049
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
5050
            ++q;
5051
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5052
        }
5053
    }
5054
    else {
5055
        term1 = 0;
5056
        term0 = bSig;
5057
    }
5058
    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5059
    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5060
         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5061
              && ( q & 1 ) )
5062
       ) {
5063
        aSig0 = alternateASig0;
5064
        aSig1 = alternateASig1;
5065
        zSign = ! zSign;
5066
    }
5067
    return
5068
        normalizeRoundAndPackFloatx80(
5069
            80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
5070

    
5071
}
5072

    
5073
/*----------------------------------------------------------------------------
5074
| Returns the square root of the extended double-precision floating-point
5075
| value `a'.  The operation is performed according to the IEC/IEEE Standard
5076
| for Binary Floating-Point Arithmetic.
5077
*----------------------------------------------------------------------------*/
5078

    
5079
floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
5080
{
5081
    flag aSign;
5082
    int32 aExp, zExp;
5083
    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5084
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5085
    floatx80 z;
5086

    
5087
    aSig0 = extractFloatx80Frac( a );
5088
    aExp = extractFloatx80Exp( a );
5089
    aSign = extractFloatx80Sign( a );
5090
    if ( aExp == 0x7FFF ) {
5091
        if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
5092
        if ( ! aSign ) return a;
5093
        goto invalid;
5094
    }
5095
    if ( aSign ) {
5096
        if ( ( aExp | aSig0 ) == 0 ) return a;
5097
 invalid:
5098
        float_raise( float_flag_invalid STATUS_VAR);
5099
        z.low = floatx80_default_nan_low;
5100
        z.high = floatx80_default_nan_high;
5101
        return z;
5102
    }
5103
    if ( aExp == 0 ) {
5104
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5105
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5106
    }
5107
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5108
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5109
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5110
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5111
    doubleZSig0 = zSig0<<1;
5112
    mul64To128( zSig0, zSig0, &term0, &term1 );
5113
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5114
    while ( (int64_t) rem0 < 0 ) {
5115
        --zSig0;
5116
        doubleZSig0 -= 2;
5117
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5118
    }
5119
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5120
    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5121
        if ( zSig1 == 0 ) zSig1 = 1;
5122
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5123
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5124
        mul64To128( zSig1, zSig1, &term2, &term3 );
5125
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5126
        while ( (int64_t) rem1 < 0 ) {
5127
            --zSig1;
5128
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5129
            term3 |= 1;
5130
            term2 |= doubleZSig0;
5131
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5132
        }
5133
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5134
    }
5135
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5136
    zSig0 |= doubleZSig0;
5137
    return
5138
        roundAndPackFloatx80(
5139
            STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
5140

    
5141
}
5142

    
5143
/*----------------------------------------------------------------------------
5144
| Returns 1 if the extended double-precision floating-point value `a' is equal
5145
| to the corresponding value `b', and 0 otherwise.  The invalid exception is
5146
| raised if either operand is a NaN.  Otherwise, the comparison is performed
5147
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5148
*----------------------------------------------------------------------------*/
5149

    
5150
int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
5151
{
5152

    
5153
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5154
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5155
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5156
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5157
       ) {
5158
        float_raise( float_flag_invalid STATUS_VAR);
5159
        return 0;
5160
    }
5161
    return
5162
           ( a.low == b.low )
5163
        && (    ( a.high == b.high )
5164
             || (    ( a.low == 0 )
5165
                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5166
           );
5167

    
5168
}
5169

    
5170
/*----------------------------------------------------------------------------
5171
| Returns 1 if the extended double-precision floating-point value `a' is
5172
| less than or equal to the corresponding value `b', and 0 otherwise.  The
5173
| invalid exception is raised if either operand is a NaN.  The comparison is
5174
| performed according to the IEC/IEEE Standard for Binary Floating-Point
5175
| Arithmetic.
5176
*----------------------------------------------------------------------------*/
5177

    
5178
int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
5179
{
5180
    flag aSign, bSign;
5181

    
5182
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5183
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5184
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5185
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5186
       ) {
5187
        float_raise( float_flag_invalid STATUS_VAR);
5188
        return 0;
5189
    }
5190
    aSign = extractFloatx80Sign( a );
5191
    bSign = extractFloatx80Sign( b );
5192
    if ( aSign != bSign ) {
5193
        return
5194
               aSign
5195
            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5196
                 == 0 );
5197
    }
5198
    return
5199
          aSign ? le128( b.high, b.low, a.high, a.low )
5200
        : le128( a.high, a.low, b.high, b.low );
5201

    
5202
}
5203

    
5204
/*----------------------------------------------------------------------------
5205
| Returns 1 if the extended double-precision floating-point value `a' is
5206
| less than the corresponding value `b', and 0 otherwise.  The invalid
5207
| exception is raised if either operand is a NaN.  The comparison is performed
5208
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5209
*----------------------------------------------------------------------------*/
5210

    
5211
int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
5212
{
5213
    flag aSign, bSign;
5214

    
5215
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5216
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5217
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5218
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5219
       ) {
5220
        float_raise( float_flag_invalid STATUS_VAR);
5221
        return 0;
5222
    }
5223
    aSign = extractFloatx80Sign( a );
5224
    bSign = extractFloatx80Sign( b );
5225
    if ( aSign != bSign ) {
5226
        return
5227
               aSign
5228
            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5229
                 != 0 );
5230
    }
5231
    return
5232
          aSign ? lt128( b.high, b.low, a.high, a.low )
5233
        : lt128( a.high, a.low, b.high, b.low );
5234

    
5235
}
5236

    
5237
/*----------------------------------------------------------------------------
5238
| Returns 1 if the extended double-precision floating-point values `a' and `b'
5239
| cannot be compared, and 0 otherwise.  The invalid exception is raised if
5240
| either operand is a NaN.   The comparison is performed according to the
5241
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5242
*----------------------------------------------------------------------------*/
5243
int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
5244
{
5245
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5246
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5247
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5248
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5249
       ) {
5250
        float_raise( float_flag_invalid STATUS_VAR);
5251
        return 1;
5252
    }
5253
    return 0;
5254
}
5255

    
5256
/*----------------------------------------------------------------------------
5257
| Returns 1 if the extended double-precision floating-point value `a' is
5258
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5259
| cause an exception.  The comparison is performed according to the IEC/IEEE
5260
| Standard for Binary Floating-Point Arithmetic.
5261
*----------------------------------------------------------------------------*/
5262

    
5263
int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5264
{
5265

    
5266
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5267
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5268
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5269
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5270
       ) {
5271
        if (    floatx80_is_signaling_nan( a )
5272
             || floatx80_is_signaling_nan( b ) ) {
5273
            float_raise( float_flag_invalid STATUS_VAR);
5274
        }
5275
        return 0;
5276
    }
5277
    return
5278
           ( a.low == b.low )
5279
        && (    ( a.high == b.high )
5280
             || (    ( a.low == 0 )
5281
                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5282
           );
5283

    
5284
}
5285

    
5286
/*----------------------------------------------------------------------------
5287
| Returns 1 if the extended double-precision floating-point value `a' is less
5288
| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5289
| do not cause an exception.  Otherwise, the comparison is performed according
5290
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5291
*----------------------------------------------------------------------------*/
5292

    
5293
int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5294
{
5295
    flag aSign, bSign;
5296

    
5297
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5298
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5299
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5300
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5301
       ) {
5302
        if (    floatx80_is_signaling_nan( a )
5303
             || floatx80_is_signaling_nan( b ) ) {
5304
            float_raise( float_flag_invalid STATUS_VAR);
5305
        }
5306
        return 0;
5307
    }
5308
    aSign = extractFloatx80Sign( a );
5309
    bSign = extractFloatx80Sign( b );
5310
    if ( aSign != bSign ) {
5311
        return
5312
               aSign
5313
            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5314
                 == 0 );
5315
    }
5316
    return
5317
          aSign ? le128( b.high, b.low, a.high, a.low )
5318
        : le128( a.high, a.low, b.high, b.low );
5319

    
5320
}
5321

    
5322
/*----------------------------------------------------------------------------
5323
| Returns 1 if the extended double-precision floating-point value `a' is less
5324
| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5325
| an exception.  Otherwise, the comparison is performed according to the
5326
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5327
*----------------------------------------------------------------------------*/
5328

    
5329
int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5330
{
5331
    flag aSign, bSign;
5332

    
5333
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5334
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5335
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5336
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5337
       ) {
5338
        if (    floatx80_is_signaling_nan( a )
5339
             || floatx80_is_signaling_nan( b ) ) {
5340
            float_raise( float_flag_invalid STATUS_VAR);
5341
        }
5342
        return 0;
5343
    }
5344
    aSign = extractFloatx80Sign( a );
5345
    bSign = extractFloatx80Sign( b );
5346
    if ( aSign != bSign ) {
5347
        return
5348
               aSign
5349
            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5350
                 != 0 );
5351
    }
5352
    return
5353
          aSign ? lt128( b.high, b.low, a.high, a.low )
5354
        : lt128( a.high, a.low, b.high, b.low );
5355

    
5356
}
5357

    
5358
/*----------------------------------------------------------------------------
5359
| Returns 1 if the extended double-precision floating-point values `a' and `b'
5360
| cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5361
| The comparison is performed according to the IEC/IEEE Standard for Binary
5362
| Floating-Point Arithmetic.
5363
*----------------------------------------------------------------------------*/
5364
int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5365
{
5366
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5367
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5368
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5369
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5370
       ) {
5371
        if (    floatx80_is_signaling_nan( a )
5372
             || floatx80_is_signaling_nan( b ) ) {
5373
            float_raise( float_flag_invalid STATUS_VAR);
5374
        }
5375
        return 1;
5376
    }
5377
    return 0;
5378
}
5379

    
5380
/*----------------------------------------------------------------------------
5381
| Returns the result of converting the quadruple-precision floating-point
5382
| value `a' to the 32-bit two's complement integer format.  The conversion
5383
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5384
| Arithmetic---which means in particular that the conversion is rounded
5385
| according to the current rounding mode.  If `a' is a NaN, the largest
5386
| positive integer is returned.  Otherwise, if the conversion overflows, the
5387
| largest integer with the same sign as `a' is returned.
5388
*----------------------------------------------------------------------------*/
5389

    
5390
int32 float128_to_int32( float128 a STATUS_PARAM )
5391
{
5392
    flag aSign;
5393
    int32 aExp, shiftCount;
5394
    uint64_t aSig0, aSig1;
5395

    
5396
    aSig1 = extractFloat128Frac1( a );
5397
    aSig0 = extractFloat128Frac0( a );
5398
    aExp = extractFloat128Exp( a );
5399
    aSign = extractFloat128Sign( a );
5400
    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5401
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5402
    aSig0 |= ( aSig1 != 0 );
5403
    shiftCount = 0x4028 - aExp;
5404
    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5405
    return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
5406

    
5407
}
5408

    
5409
/*----------------------------------------------------------------------------
5410
| Returns the result of converting the quadruple-precision floating-point
5411
| value `a' to the 32-bit two's complement integer format.  The conversion
5412
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5413
| Arithmetic, except that the conversion is always rounded toward zero.  If
5414
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5415
| conversion overflows, the largest integer with the same sign as `a' is
5416
| returned.
5417
*----------------------------------------------------------------------------*/
5418

    
5419
int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
5420
{
5421
    flag aSign;
5422
    int32 aExp, shiftCount;
5423
    uint64_t aSig0, aSig1, savedASig;
5424
    int32_t z;
5425

    
5426
    aSig1 = extractFloat128Frac1( a );
5427
    aSig0 = extractFloat128Frac0( a );
5428
    aExp = extractFloat128Exp( a );
5429
    aSign = extractFloat128Sign( a );
5430
    aSig0 |= ( aSig1 != 0 );
5431
    if ( 0x401E < aExp ) {
5432
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5433
        goto invalid;
5434
    }
5435
    else if ( aExp < 0x3FFF ) {
5436
        if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
5437
        return 0;
5438
    }
5439
    aSig0 |= LIT64( 0x0001000000000000 );
5440
    shiftCount = 0x402F - aExp;
5441
    savedASig = aSig0;
5442
    aSig0 >>= shiftCount;
5443
    z = aSig0;
5444
    if ( aSign ) z = - z;
5445
    if ( ( z < 0 ) ^ aSign ) {
5446
 invalid:
5447
        float_raise( float_flag_invalid STATUS_VAR);
5448
        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5449
    }
5450
    if ( ( aSig0<<shiftCount ) != savedASig ) {
5451
        STATUS(float_exception_flags) |= float_flag_inexact;
5452
    }
5453
    return z;
5454

    
5455
}
5456

    
5457
/*----------------------------------------------------------------------------
5458
| Returns the result of converting the quadruple-precision floating-point
5459
| value `a' to the 64-bit two's complement integer format.  The conversion
5460
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5461
| Arithmetic---which means in particular that the conversion is rounded
5462
| according to the current rounding mode.  If `a' is a NaN, the largest
5463
| positive integer is returned.  Otherwise, if the conversion overflows, the
5464
| largest integer with the same sign as `a' is returned.
5465
*----------------------------------------------------------------------------*/
5466

    
5467
int64 float128_to_int64( float128 a STATUS_PARAM )
5468
{
5469
    flag aSign;
5470
    int32 aExp, shiftCount;
5471
    uint64_t aSig0, aSig1;
5472

    
5473
    aSig1 = extractFloat128Frac1( a );
5474
    aSig0 = extractFloat128Frac0( a );
5475
    aExp = extractFloat128Exp( a );
5476
    aSign = extractFloat128Sign( a );
5477
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5478
    shiftCount = 0x402F - aExp;
5479
    if ( shiftCount <= 0 ) {
5480
        if ( 0x403E < aExp ) {
5481
            float_raise( float_flag_invalid STATUS_VAR);
5482
            if (    ! aSign
5483
                 || (    ( aExp == 0x7FFF )
5484
                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5485
                    )
5486
               ) {
5487
                return LIT64( 0x7FFFFFFFFFFFFFFF );
5488
            }
5489
            return (int64_t) LIT64( 0x8000000000000000 );
5490
        }
5491
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5492
    }
5493
    else {
5494
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5495
    }
5496
    return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
5497

    
5498
}
5499

    
5500
/*----------------------------------------------------------------------------
5501
| Returns the result of converting the quadruple-precision floating-point
5502
| value `a' to the 64-bit two's complement integer format.  The conversion
5503
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5504
| Arithmetic, except that the conversion is always rounded toward zero.
5505
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5506
| the conversion overflows, the largest integer with the same sign as `a' is
5507
| returned.
5508
*----------------------------------------------------------------------------*/
5509

    
5510
int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
5511
{
5512
    flag aSign;
5513
    int32 aExp, shiftCount;
5514
    uint64_t aSig0, aSig1;
5515
    int64 z;
5516

    
5517
    aSig1 = extractFloat128Frac1( a );
5518
    aSig0 = extractFloat128Frac0( a );
5519
    aExp = extractFloat128Exp( a );
5520
    aSign = extractFloat128Sign( a );
5521
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5522
    shiftCount = aExp - 0x402F;
5523
    if ( 0 < shiftCount ) {
5524
        if ( 0x403E <= aExp ) {
5525
            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5526
            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5527
                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5528
                if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
5529
            }
5530
            else {
5531
                float_raise( float_flag_invalid STATUS_VAR);
5532
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5533
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
5534
                }
5535
            }
5536
            return (int64_t) LIT64( 0x8000000000000000 );
5537
        }
5538
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5539
        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5540
            STATUS(float_exception_flags) |= float_flag_inexact;
5541
        }
5542
    }
5543
    else {
5544
        if ( aExp < 0x3FFF ) {
5545
            if ( aExp | aSig0 | aSig1 ) {
5546
                STATUS(float_exception_flags) |= float_flag_inexact;
5547
            }
5548
            return 0;
5549
        }
5550
        z = aSig0>>( - shiftCount );
5551
        if (    aSig1
5552
             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5553
            STATUS(float_exception_flags) |= float_flag_inexact;
5554
        }
5555
    }
5556
    if ( aSign ) z = - z;
5557
    return z;
5558

    
5559
}
5560

    
5561
/*----------------------------------------------------------------------------
5562
| Returns the result of converting the quadruple-precision floating-point
5563
| value `a' to the single-precision floating-point format.  The conversion
5564
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5565
| Arithmetic.
5566
*----------------------------------------------------------------------------*/
5567

    
5568
float32 float128_to_float32( float128 a STATUS_PARAM )
5569
{
5570
    flag aSign;
5571
    int32 aExp;
5572
    uint64_t aSig0, aSig1;
5573
    uint32_t zSig;
5574

    
5575
    aSig1 = extractFloat128Frac1( a );
5576
    aSig0 = extractFloat128Frac0( a );
5577
    aExp = extractFloat128Exp( a );
5578
    aSign = extractFloat128Sign( a );
5579
    if ( aExp == 0x7FFF ) {
5580
        if ( aSig0 | aSig1 ) {
5581
            return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5582
        }
5583
        return packFloat32( aSign, 0xFF, 0 );
5584
    }
5585
    aSig0 |= ( aSig1 != 0 );
5586
    shift64RightJamming( aSig0, 18, &aSig0 );
5587
    zSig = aSig0;
5588
    if ( aExp || zSig ) {
5589
        zSig |= 0x40000000;
5590
        aExp -= 0x3F81;
5591
    }
5592
    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5593

    
5594
}
5595

    
5596
/*----------------------------------------------------------------------------
5597
| Returns the result of converting the quadruple-precision floating-point
5598
| value `a' to the double-precision floating-point format.  The conversion
5599
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5600
| Arithmetic.
5601
*----------------------------------------------------------------------------*/
5602

    
5603
float64 float128_to_float64( float128 a STATUS_PARAM )
5604
{
5605
    flag aSign;
5606
    int32 aExp;
5607
    uint64_t aSig0, aSig1;
5608

    
5609
    aSig1 = extractFloat128Frac1( a );
5610
    aSig0 = extractFloat128Frac0( a );
5611
    aExp = extractFloat128Exp( a );
5612
    aSign = extractFloat128Sign( a );
5613
    if ( aExp == 0x7FFF ) {
5614
        if ( aSig0 | aSig1 ) {
5615
            return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5616
        }
5617
        return packFloat64( aSign, 0x7FF, 0 );
5618
    }
5619
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5620
    aSig0 |= ( aSig1 != 0 );
5621
    if ( aExp || aSig0 ) {
5622
        aSig0 |= LIT64( 0x4000000000000000 );
5623
        aExp -= 0x3C01;
5624
    }
5625
    return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5626

    
5627
}
5628

    
5629
/*----------------------------------------------------------------------------
5630
| Returns the result of converting the quadruple-precision floating-point
5631
| value `a' to the extended double-precision floating-point format.  The
5632
| conversion is performed according to the IEC/IEEE Standard for Binary
5633
| Floating-Point Arithmetic.
5634
*----------------------------------------------------------------------------*/
5635

    
5636
floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5637
{
5638
    flag aSign;
5639
    int32 aExp;
5640
    uint64_t aSig0, aSig1;
5641

    
5642
    aSig1 = extractFloat128Frac1( a );
5643
    aSig0 = extractFloat128Frac0( a );
5644
    aExp = extractFloat128Exp( a );
5645
    aSign = extractFloat128Sign( a );
5646
    if ( aExp == 0x7FFF ) {
5647
        if ( aSig0 | aSig1 ) {
5648
            return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5649
        }
5650
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5651
    }
5652
    if ( aExp == 0 ) {
5653
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5654
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5655
    }
5656
    else {
5657
        aSig0 |= LIT64( 0x0001000000000000 );
5658
    }
5659
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5660
    return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5661

    
5662
}
5663

    
5664
/*----------------------------------------------------------------------------
5665
| Rounds the quadruple-precision floating-point value `a' to an integer, and
5666
| returns the result as a quadruple-precision floating-point value.  The
5667
| operation is performed according to the IEC/IEEE Standard for Binary
5668
| Floating-Point Arithmetic.
5669
*----------------------------------------------------------------------------*/
5670

    
5671
float128 float128_round_to_int( float128 a STATUS_PARAM )
5672
{
5673
    flag aSign;
5674
    int32 aExp;
5675
    uint64_t lastBitMask, roundBitsMask;
5676
    int8 roundingMode;
5677
    float128 z;
5678

    
5679
    aExp = extractFloat128Exp( a );
5680
    if ( 0x402F <= aExp ) {
5681
        if ( 0x406F <= aExp ) {
5682
            if (    ( aExp == 0x7FFF )
5683
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5684
               ) {
5685
                return propagateFloat128NaN( a, a STATUS_VAR );
5686
            }
5687
            return a;
5688
        }
5689
        lastBitMask = 1;
5690
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5691
        roundBitsMask = lastBitMask - 1;
5692
        z = a;
5693
        roundingMode = STATUS(float_rounding_mode);
5694
        if ( roundingMode == float_round_nearest_even ) {
5695
            if ( lastBitMask ) {
5696
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5697
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5698
            }
5699
            else {
5700
                if ( (int64_t) z.low < 0 ) {
5701
                    ++z.high;
5702
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5703
                }
5704
            }
5705
        }
5706
        else if ( roundingMode != float_round_to_zero ) {
5707
            if (   extractFloat128Sign( z )
5708
                 ^ ( roundingMode == float_round_up ) ) {
5709
                add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5710
            }
5711
        }
5712
        z.low &= ~ roundBitsMask;
5713
    }
5714
    else {
5715
        if ( aExp < 0x3FFF ) {
5716
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5717
            STATUS(float_exception_flags) |= float_flag_inexact;
5718
            aSign = extractFloat128Sign( a );
5719
            switch ( STATUS(float_rounding_mode) ) {
5720
             case float_round_nearest_even:
5721
                if (    ( aExp == 0x3FFE )
5722
                     && (   extractFloat128Frac0( a )
5723
                          | extractFloat128Frac1( a ) )
5724
                   ) {
5725
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
5726
                }
5727
                break;
5728
             case float_round_down:
5729
                return
5730
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5731
                    : packFloat128( 0, 0, 0, 0 );
5732
             case float_round_up:
5733
                return
5734
                      aSign ? packFloat128( 1, 0, 0, 0 )
5735
                    : packFloat128( 0, 0x3FFF, 0, 0 );
5736
            }
5737
            return packFloat128( aSign, 0, 0, 0 );
5738
        }
5739
        lastBitMask = 1;
5740
        lastBitMask <<= 0x402F - aExp;
5741
        roundBitsMask = lastBitMask - 1;
5742
        z.low = 0;
5743
        z.high = a.high;
5744
        roundingMode = STATUS(float_rounding_mode);
5745
        if ( roundingMode == float_round_nearest_even ) {
5746
            z.high += lastBitMask>>1;
5747
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5748
                z.high &= ~ lastBitMask;
5749
            }
5750
        }
5751
        else if ( roundingMode != float_round_to_zero ) {
5752
            if (   extractFloat128Sign( z )
5753
                 ^ ( roundingMode == float_round_up ) ) {
5754
                z.high |= ( a.low != 0 );
5755
                z.high += roundBitsMask;
5756
            }
5757
        }
5758
        z.high &= ~ roundBitsMask;
5759
    }
5760
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5761
        STATUS(float_exception_flags) |= float_flag_inexact;
5762
    }
5763
    return z;
5764

    
5765
}
5766

    
5767
/*----------------------------------------------------------------------------
5768
| Returns the result of adding the absolute values of the quadruple-precision
5769
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
5770
| before being returned.  `zSign' is ignored if the result is a NaN.
5771
| The addition is performed according to the IEC/IEEE Standard for Binary
5772
| Floating-Point Arithmetic.
5773
*----------------------------------------------------------------------------*/
5774

    
5775
static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5776
{
5777
    int32 aExp, bExp, zExp;
5778
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5779
    int32 expDiff;
5780

    
5781
    aSig1 = extractFloat128Frac1( a );
5782
    aSig0 = extractFloat128Frac0( a );
5783
    aExp = extractFloat128Exp( a );
5784
    bSig1 = extractFloat128Frac1( b );
5785
    bSig0 = extractFloat128Frac0( b );
5786
    bExp = extractFloat128Exp( b );
5787
    expDiff = aExp - bExp;
5788
    if ( 0 < expDiff ) {
5789
        if ( aExp == 0x7FFF ) {
5790
            if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5791
            return a;
5792
        }
5793
        if ( bExp == 0 ) {
5794
            --expDiff;
5795
        }
5796
        else {
5797
            bSig0 |= LIT64( 0x0001000000000000 );
5798
        }
5799
        shift128ExtraRightJamming(
5800
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5801
        zExp = aExp;
5802
    }
5803
    else if ( expDiff < 0 ) {
5804
        if ( bExp == 0x7FFF ) {
5805
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5806
            return packFloat128( zSign, 0x7FFF, 0, 0 );
5807
        }
5808
        if ( aExp == 0 ) {
5809
            ++expDiff;
5810
        }
5811
        else {
5812
            aSig0 |= LIT64( 0x0001000000000000 );
5813
        }
5814
        shift128ExtraRightJamming(
5815
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5816
        zExp = bExp;
5817
    }
5818
    else {
5819
        if ( aExp == 0x7FFF ) {
5820
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5821
                return propagateFloat128NaN( a, b STATUS_VAR );
5822
            }
5823
            return a;
5824
        }
5825
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5826
        if ( aExp == 0 ) {
5827
            if (STATUS(flush_to_zero)) {
5828
                if (zSig0 | zSig1) {
5829
                    float_raise(float_flag_output_denormal STATUS_VAR);
5830
                }
5831
                return packFloat128(zSign, 0, 0, 0);
5832
            }
5833
            return packFloat128( zSign, 0, zSig0, zSig1 );
5834
        }
5835
        zSig2 = 0;
5836
        zSig0 |= LIT64( 0x0002000000000000 );
5837
        zExp = aExp;
5838
        goto shiftRight1;
5839
    }
5840
    aSig0 |= LIT64( 0x0001000000000000 );
5841
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5842
    --zExp;
5843
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5844
    ++zExp;
5845
 shiftRight1:
5846
    shift128ExtraRightJamming(
5847
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5848
 roundAndPack:
5849
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5850

    
5851
}
5852

    
5853
/*----------------------------------------------------------------------------
5854
| Returns the result of subtracting the absolute values of the quadruple-
5855
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
5856
| difference is negated before being returned.  `zSign' is ignored if the
5857
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
5858
| Standard for Binary Floating-Point Arithmetic.
5859
*----------------------------------------------------------------------------*/
5860

    
5861
static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5862
{
5863
    int32 aExp, bExp, zExp;
5864
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5865
    int32 expDiff;
5866
    float128 z;
5867

    
5868
    aSig1 = extractFloat128Frac1( a );
5869
    aSig0 = extractFloat128Frac0( a );
5870
    aExp = extractFloat128Exp( a );
5871
    bSig1 = extractFloat128Frac1( b );
5872
    bSig0 = extractFloat128Frac0( b );
5873
    bExp = extractFloat128Exp( b );
5874
    expDiff = aExp - bExp;
5875
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5876
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5877
    if ( 0 < expDiff ) goto aExpBigger;
5878
    if ( expDiff < 0 ) goto bExpBigger;
5879
    if ( aExp == 0x7FFF ) {
5880
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5881
            return propagateFloat128NaN( a, b STATUS_VAR );
5882
        }
5883
        float_raise( float_flag_invalid STATUS_VAR);
5884
        z.low = float128_default_nan_low;
5885
        z.high = float128_default_nan_high;
5886
        return z;
5887
    }
5888
    if ( aExp == 0 ) {
5889
        aExp = 1;
5890
        bExp = 1;
5891
    }
5892
    if ( bSig0 < aSig0 ) goto aBigger;
5893
    if ( aSig0 < bSig0 ) goto bBigger;
5894
    if ( bSig1 < aSig1 ) goto aBigger;
5895
    if ( aSig1 < bSig1 ) goto bBigger;
5896
    return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5897
 bExpBigger:
5898
    if ( bExp == 0x7FFF ) {
5899
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5900
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5901
    }
5902
    if ( aExp == 0 ) {
5903
        ++expDiff;
5904
    }
5905
    else {
5906
        aSig0 |= LIT64( 0x4000000000000000 );
5907
    }
5908
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5909
    bSig0 |= LIT64( 0x4000000000000000 );
5910
 bBigger:
5911
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5912
    zExp = bExp;
5913
    zSign ^= 1;
5914
    goto normalizeRoundAndPack;
5915
 aExpBigger:
5916
    if ( aExp == 0x7FFF ) {
5917
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5918
        return a;
5919
    }
5920
    if ( bExp == 0 ) {
5921
        --expDiff;
5922
    }
5923
    else {
5924
        bSig0 |= LIT64( 0x4000000000000000 );
5925
    }
5926
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5927
    aSig0 |= LIT64( 0x4000000000000000 );
5928
 aBigger:
5929
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5930
    zExp = aExp;
5931
 normalizeRoundAndPack:
5932
    --zExp;
5933
    return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5934

    
5935
}
5936

    
5937
/*----------------------------------------------------------------------------
5938
| Returns the result of adding the quadruple-precision floating-point values
5939
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
5940
| for Binary Floating-Point Arithmetic.
5941
*----------------------------------------------------------------------------*/
5942

    
5943
float128 float128_add( float128 a, float128 b STATUS_PARAM )
5944
{
5945
    flag aSign, bSign;
5946

    
5947
    aSign = extractFloat128Sign( a );
5948
    bSign = extractFloat128Sign( b );
5949
    if ( aSign == bSign ) {
5950
        return addFloat128Sigs( a, b, aSign STATUS_VAR );
5951
    }
5952
    else {
5953
        return subFloat128Sigs( a, b, aSign STATUS_VAR );
5954
    }
5955

    
5956
}
5957

    
5958
/*----------------------------------------------------------------------------
5959
| Returns the result of subtracting the quadruple-precision floating-point
5960
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
5961
| Standard for Binary Floating-Point Arithmetic.
5962
*----------------------------------------------------------------------------*/
5963

    
5964
float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5965
{
5966
    flag aSign, bSign;
5967

    
5968
    aSign = extractFloat128Sign( a );
5969
    bSign = extractFloat128Sign( b );
5970
    if ( aSign == bSign ) {
5971
        return subFloat128Sigs( a, b, aSign STATUS_VAR );
5972
    }
5973
    else {
5974
        return addFloat128Sigs( a, b, aSign STATUS_VAR );
5975
    }
5976

    
5977
}
5978

    
5979
/*----------------------------------------------------------------------------
5980
| Returns the result of multiplying the quadruple-precision floating-point
5981
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
5982
| Standard for Binary Floating-Point Arithmetic.
5983
*----------------------------------------------------------------------------*/
5984

    
5985
float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5986
{
5987
    flag aSign, bSign, zSign;
5988
    int32 aExp, bExp, zExp;
5989
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5990
    float128 z;
5991

    
5992
    aSig1 = extractFloat128Frac1( a );
5993
    aSig0 = extractFloat128Frac0( a );
5994
    aExp = extractFloat128Exp( a );
5995
    aSign = extractFloat128Sign( a );
5996
    bSig1 = extractFloat128Frac1( b );
5997
    bSig0 = extractFloat128Frac0( b );
5998
    bExp = extractFloat128Exp( b );
5999
    bSign = extractFloat128Sign( b );
6000
    zSign = aSign ^ bSign;
6001
    if ( aExp == 0x7FFF ) {
6002
        if (    ( aSig0 | aSig1 )
6003
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6004
            return propagateFloat128NaN( a, b STATUS_VAR );
6005
        }
6006
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6007
        return packFloat128( zSign, 0x7FFF, 0, 0 );
6008
    }
6009
    if ( bExp == 0x7FFF ) {
6010
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6011
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6012
 invalid:
6013
            float_raise( float_flag_invalid STATUS_VAR);
6014
            z.low = float128_default_nan_low;
6015
            z.high = float128_default_nan_high;
6016
            return z;
6017
        }
6018
        return packFloat128( zSign, 0x7FFF, 0, 0 );
6019
    }
6020
    if ( aExp == 0 ) {
6021
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6022
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6023
    }
6024
    if ( bExp == 0 ) {
6025
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6026
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6027
    }
6028
    zExp = aExp + bExp - 0x4000;
6029
    aSig0 |= LIT64( 0x0001000000000000 );
6030
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6031
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6032
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6033
    zSig2 |= ( zSig3 != 0 );
6034
    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6035
        shift128ExtraRightJamming(
6036
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6037
        ++zExp;
6038
    }
6039
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6040

    
6041
}
6042

    
6043
/*----------------------------------------------------------------------------
6044
| Returns the result of dividing the quadruple-precision floating-point value
6045
| `a' by the corresponding value `b'.  The operation is performed according to
6046
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6047
*----------------------------------------------------------------------------*/
6048

    
6049
float128 float128_div( float128 a, float128 b STATUS_PARAM )
6050
{
6051
    flag aSign, bSign, zSign;
6052
    int32 aExp, bExp, zExp;
6053
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6054
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6055
    float128 z;
6056

    
6057
    aSig1 = extractFloat128Frac1( a );
6058
    aSig0 = extractFloat128Frac0( a );
6059
    aExp = extractFloat128Exp( a );
6060
    aSign = extractFloat128Sign( a );
6061
    bSig1 = extractFloat128Frac1( b );
6062
    bSig0 = extractFloat128Frac0( b );
6063
    bExp = extractFloat128Exp( b );
6064
    bSign = extractFloat128Sign( b );
6065
    zSign = aSign ^ bSign;
6066
    if ( aExp == 0x7FFF ) {
6067
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6068
        if ( bExp == 0x7FFF ) {
6069
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6070
            goto invalid;
6071
        }
6072
        return packFloat128( zSign, 0x7FFF, 0, 0 );
6073
    }
6074
    if ( bExp == 0x7FFF ) {
6075
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6076
        return packFloat128( zSign, 0, 0, 0 );
6077
    }
6078
    if ( bExp == 0 ) {
6079
        if ( ( bSig0 | bSig1 ) == 0 ) {
6080
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6081
 invalid:
6082
                float_raise( float_flag_invalid STATUS_VAR);
6083
                z.low = float128_default_nan_low;
6084
                z.high = float128_default_nan_high;
6085
                return z;
6086
            }
6087
            float_raise( float_flag_divbyzero STATUS_VAR);
6088
            return packFloat128( zSign, 0x7FFF, 0, 0 );
6089
        }
6090
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6091
    }
6092
    if ( aExp == 0 ) {
6093
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6094
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6095
    }
6096
    zExp = aExp - bExp + 0x3FFD;
6097
    shortShift128Left(
6098
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6099
    shortShift128Left(
6100
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6101
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6102
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6103
        ++zExp;
6104
    }
6105
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6106
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6107
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6108
    while ( (int64_t) rem0 < 0 ) {
6109
        --zSig0;
6110
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6111
    }
6112
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6113
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6114
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6115
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6116
        while ( (int64_t) rem1 < 0 ) {
6117
            --zSig1;
6118
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6119
        }
6120
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6121
    }
6122
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6123
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6124

    
6125
}
6126

    
6127
/*----------------------------------------------------------------------------
6128
| Returns the remainder of the quadruple-precision floating-point value `a'
6129
| with respect to the corresponding value `b'.  The operation is performed
6130
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6131
*----------------------------------------------------------------------------*/
6132

    
6133
float128 float128_rem( float128 a, float128 b STATUS_PARAM )
6134
{
6135
    flag aSign, zSign;
6136
    int32 aExp, bExp, expDiff;
6137
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6138
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6139
    int64_t sigMean0;
6140
    float128 z;
6141

    
6142
    aSig1 = extractFloat128Frac1( a );
6143
    aSig0 = extractFloat128Frac0( a );
6144
    aExp = extractFloat128Exp( a );
6145
    aSign = extractFloat128Sign( a );
6146
    bSig1 = extractFloat128Frac1( b );
6147
    bSig0 = extractFloat128Frac0( b );
6148
    bExp = extractFloat128Exp( b );
6149
    if ( aExp == 0x7FFF ) {
6150
        if (    ( aSig0 | aSig1 )
6151
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6152
            return propagateFloat128NaN( a, b STATUS_VAR );
6153
        }
6154
        goto invalid;
6155
    }
6156
    if ( bExp == 0x7FFF ) {
6157
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6158
        return a;
6159
    }
6160
    if ( bExp == 0 ) {
6161
        if ( ( bSig0 | bSig1 ) == 0 ) {
6162
 invalid:
6163
            float_raise( float_flag_invalid STATUS_VAR);
6164
            z.low = float128_default_nan_low;
6165
            z.high = float128_default_nan_high;
6166
            return z;
6167
        }
6168
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6169
    }
6170
    if ( aExp == 0 ) {
6171
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
6172
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6173
    }
6174
    expDiff = aExp - bExp;
6175
    if ( expDiff < -1 ) return a;
6176
    shortShift128Left(
6177
        aSig0 | LIT64( 0x0001000000000000 ),
6178
        aSig1,
6179
        15 - ( expDiff < 0 ),
6180
        &aSig0,
6181
        &aSig1
6182
    );
6183
    shortShift128Left(
6184
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6185
    q = le128( bSig0, bSig1, aSig0, aSig1 );
6186
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6187
    expDiff -= 64;
6188
    while ( 0 < expDiff ) {
6189
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6190
        q = ( 4 < q ) ? q - 4 : 0;
6191
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6192
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6193
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6194
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6195
        expDiff -= 61;
6196
    }
6197
    if ( -64 < expDiff ) {
6198
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6199
        q = ( 4 < q ) ? q - 4 : 0;
6200
        q >>= - expDiff;
6201
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6202
        expDiff += 52;
6203
        if ( expDiff < 0 ) {
6204
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6205
        }
6206
        else {
6207
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6208
        }
6209
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6210
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6211
    }
6212
    else {
6213
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6214
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6215
    }
6216
    do {
6217
        alternateASig0 = aSig0;
6218
        alternateASig1 = aSig1;
6219
        ++q;
6220
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6221
    } while ( 0 <= (int64_t) aSig0 );
6222
    add128(
6223
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6224
    if (    ( sigMean0 < 0 )
6225
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6226
        aSig0 = alternateASig0;
6227
        aSig1 = alternateASig1;
6228
    }
6229
    zSign = ( (int64_t) aSig0 < 0 );
6230
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6231
    return
6232
        normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
6233

    
6234
}
6235

    
6236
/*----------------------------------------------------------------------------
6237
| Returns the square root of the quadruple-precision floating-point value `a'.
6238
| The operation is performed according to the IEC/IEEE Standard for Binary
6239
| Floating-Point Arithmetic.
6240
*----------------------------------------------------------------------------*/
6241

    
6242
float128 float128_sqrt( float128 a STATUS_PARAM )
6243
{
6244
    flag aSign;
6245
    int32 aExp, zExp;
6246
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6247
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6248
    float128 z;
6249

    
6250
    aSig1 = extractFloat128Frac1( a );
6251
    aSig0 = extractFloat128Frac0( a );
6252
    aExp = extractFloat128Exp( a );
6253
    aSign = extractFloat128Sign( a );
6254
    if ( aExp == 0x7FFF ) {
6255
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
6256
        if ( ! aSign ) return a;
6257
        goto invalid;
6258
    }
6259
    if ( aSign ) {
6260
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6261
 invalid:
6262
        float_raise( float_flag_invalid STATUS_VAR);
6263
        z.low = float128_default_nan_low;
6264
        z.high = float128_default_nan_high;
6265
        return z;
6266
    }
6267
    if ( aExp == 0 ) {
6268
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6269
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6270
    }
6271
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6272
    aSig0 |= LIT64( 0x0001000000000000 );
6273
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6274
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6275
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6276
    doubleZSig0 = zSig0<<1;
6277
    mul64To128( zSig0, zSig0, &term0, &term1 );
6278
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6279
    while ( (int64_t) rem0 < 0 ) {
6280
        --zSig0;
6281
        doubleZSig0 -= 2;
6282
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6283
    }
6284
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6285
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6286
        if ( zSig1 == 0 ) zSig1 = 1;
6287
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6288
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6289
        mul64To128( zSig1, zSig1, &term2, &term3 );
6290
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6291
        while ( (int64_t) rem1 < 0 ) {
6292
            --zSig1;
6293
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6294
            term3 |= 1;
6295
            term2 |= doubleZSig0;
6296
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6297
        }
6298
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6299
    }
6300
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6301
    return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6302

    
6303
}
6304

    
6305
/*----------------------------------------------------------------------------
6306
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6307
| the corresponding value `b', and 0 otherwise.  The invalid exception is
6308
| raised if either operand is a NaN.  Otherwise, the comparison is performed
6309
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6310
*----------------------------------------------------------------------------*/
6311

    
6312
int float128_eq( float128 a, float128 b STATUS_PARAM )
6313
{
6314

    
6315
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6316
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6317
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6318
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6319
       ) {
6320
        float_raise( float_flag_invalid STATUS_VAR);
6321
        return 0;
6322
    }
6323
    return
6324
           ( a.low == b.low )
6325
        && (    ( a.high == b.high )
6326
             || (    ( a.low == 0 )
6327
                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6328
           );
6329

    
6330
}
6331

    
6332
/*----------------------------------------------------------------------------
6333
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6334
| or equal to the corresponding value `b', and 0 otherwise.  The invalid
6335
| exception is raised if either operand is a NaN.  The comparison is performed
6336
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6337
*----------------------------------------------------------------------------*/
6338

    
6339
int float128_le( float128 a, float128 b STATUS_PARAM )
6340
{
6341
    flag aSign, bSign;
6342

    
6343
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6344
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6345
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6346
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6347
       ) {
6348
        float_raise( float_flag_invalid STATUS_VAR);
6349
        return 0;
6350
    }
6351
    aSign = extractFloat128Sign( a );
6352
    bSign = extractFloat128Sign( b );
6353
    if ( aSign != bSign ) {
6354
        return
6355
               aSign
6356
            || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6357
                 == 0 );
6358
    }
6359
    return
6360
          aSign ? le128( b.high, b.low, a.high, a.low )
6361
        : le128( a.high, a.low, b.high, b.low );
6362

    
6363
}
6364

    
6365
/*----------------------------------------------------------------------------
6366
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6367
| the corresponding value `b', and 0 otherwise.  The invalid exception is
6368
| raised if either operand is a NaN.  The comparison is performed according
6369
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6370
*----------------------------------------------------------------------------*/
6371

    
6372
int float128_lt( float128 a, float128 b STATUS_PARAM )
6373
{
6374
    flag aSign, bSign;
6375

    
6376
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6377
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6378
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6379
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6380
       ) {
6381
        float_raise( float_flag_invalid STATUS_VAR);
6382
        return 0;
6383
    }
6384
    aSign = extractFloat128Sign( a );
6385
    bSign = extractFloat128Sign( b );
6386
    if ( aSign != bSign ) {
6387
        return
6388
               aSign
6389
            && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6390
                 != 0 );
6391
    }
6392
    return
6393
          aSign ? lt128( b.high, b.low, a.high, a.low )
6394
        : lt128( a.high, a.low, b.high, b.low );
6395

    
6396
}
6397

    
6398
/*----------------------------------------------------------------------------
6399
| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6400
| be compared, and 0 otherwise.  The invalid exception is raised if either
6401
| operand is a NaN. The comparison is performed according to the IEC/IEEE
6402
| Standard for Binary Floating-Point Arithmetic.
6403
*----------------------------------------------------------------------------*/
6404

    
6405
int float128_unordered( float128 a, float128 b STATUS_PARAM )
6406
{
6407
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6408
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6409
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6410
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6411
       ) {
6412
        float_raise( float_flag_invalid STATUS_VAR);
6413
        return 1;
6414
    }
6415
    return 0;
6416
}
6417

    
6418
/*----------------------------------------------------------------------------
6419
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6420
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6421
| exception.  The comparison is performed according to the IEC/IEEE Standard
6422
| for Binary Floating-Point Arithmetic.
6423
*----------------------------------------------------------------------------*/
6424

    
6425
int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
6426
{
6427

    
6428
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6429
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6430
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6431
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6432
       ) {
6433
        if (    float128_is_signaling_nan( a )
6434
             || float128_is_signaling_nan( b ) ) {
6435
            float_raise( float_flag_invalid STATUS_VAR);
6436
        }
6437
        return 0;
6438
    }
6439
    return
6440
           ( a.low == b.low )
6441
        && (    ( a.high == b.high )
6442
             || (    ( a.low == 0 )
6443
                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6444
           );
6445

    
6446
}
6447

    
6448
/*----------------------------------------------------------------------------
6449
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6450
| or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6451
| cause an exception.  Otherwise, the comparison is performed according to the
6452
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6453
*----------------------------------------------------------------------------*/
6454

    
6455
int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
6456
{
6457
    flag aSign, bSign;
6458

    
6459
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6460
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6461
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6462
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6463
       ) {
6464
        if (    float128_is_signaling_nan( a )
6465
             || float128_is_signaling_nan( b ) ) {
6466
            float_raise( float_flag_invalid STATUS_VAR);
6467
        }
6468
        return 0;
6469
    }
6470
    aSign = extractFloat128Sign( a );
6471
    bSign = extractFloat128Sign( b );
6472
    if ( aSign != bSign ) {
6473
        return
6474
               aSign
6475
            || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6476
                 == 0 );
6477
    }
6478
    return
6479
          aSign ? le128( b.high, b.low, a.high, a.low )
6480
        : le128( a.high, a.low, b.high, b.low );
6481

    
6482
}
6483

    
6484
/*----------------------------------------------------------------------------
6485
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6486
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6487
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6488
| Standard for Binary Floating-Point Arithmetic.
6489
*----------------------------------------------------------------------------*/
6490

    
6491
int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
6492
{
6493
    flag aSign, bSign;
6494

    
6495
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6496
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6497
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6498
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6499
       ) {
6500
        if (    float128_is_signaling_nan( a )
6501
             || float128_is_signaling_nan( b ) ) {
6502
            float_raise( float_flag_invalid STATUS_VAR);
6503
        }
6504
        return 0;
6505
    }
6506
    aSign = extractFloat128Sign( a );
6507
    bSign = extractFloat128Sign( b );
6508
    if ( aSign != bSign ) {
6509
        return
6510
               aSign
6511
            && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6512
                 != 0 );
6513
    }
6514
    return
6515
          aSign ? lt128( b.high, b.low, a.high, a.low )
6516
        : lt128( a.high, a.low, b.high, b.low );
6517

    
6518
}
6519

    
6520
/*----------------------------------------------------------------------------
6521
| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6522
| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
6523
| comparison is performed according to the IEC/IEEE Standard for Binary
6524
| Floating-Point Arithmetic.
6525
*----------------------------------------------------------------------------*/
6526

    
6527
int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
6528
{
6529
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6530
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6531
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6532
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6533
       ) {
6534
        if (    float128_is_signaling_nan( a )
6535
             || float128_is_signaling_nan( b ) ) {
6536
            float_raise( float_flag_invalid STATUS_VAR);
6537
        }
6538
        return 1;
6539
    }
6540
    return 0;
6541
}
6542

    
6543
/* misc functions */
6544
float32 uint32_to_float32(uint32_t a STATUS_PARAM)
6545
{
6546
    return int64_to_float32(a STATUS_VAR);
6547
}
6548

    
6549
float64 uint32_to_float64(uint32_t a STATUS_PARAM)
6550
{
6551
    return int64_to_float64(a STATUS_VAR);
6552
}
6553

    
6554
uint32 float32_to_uint32( float32 a STATUS_PARAM )
6555
{
6556
    int64_t v;
6557
    uint32 res;
6558
    int old_exc_flags = get_float_exception_flags(status);
6559

    
6560
    v = float32_to_int64(a STATUS_VAR);
6561
    if (v < 0) {
6562
        res = 0;
6563
    } else if (v > 0xffffffff) {
6564
        res = 0xffffffff;
6565
    } else {
6566
        return v;
6567
    }
6568
    set_float_exception_flags(old_exc_flags, status);
6569
    float_raise(float_flag_invalid STATUS_VAR);
6570
    return res;
6571
}
6572

    
6573
uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
6574
{
6575
    int64_t v;
6576
    uint32 res;
6577
    int old_exc_flags = get_float_exception_flags(status);
6578

    
6579
    v = float32_to_int64_round_to_zero(a STATUS_VAR);
6580
    if (v < 0) {
6581
        res = 0;
6582
    } else if (v > 0xffffffff) {
6583
        res = 0xffffffff;
6584
    } else {
6585
        return v;
6586
    }
6587
    set_float_exception_flags(old_exc_flags, status);
6588
    float_raise(float_flag_invalid STATUS_VAR);
6589
    return res;
6590
}
6591

    
6592
int_fast16_t float32_to_int16(float32 a STATUS_PARAM)
6593
{
6594
    int32_t v;
6595
    int_fast16_t res;
6596
    int old_exc_flags = get_float_exception_flags(status);
6597

    
6598
    v = float32_to_int32(a STATUS_VAR);
6599
    if (v < -0x8000) {
6600
        res = -0x8000;
6601
    } else if (v > 0x7fff) {
6602
        res = 0x7fff;
6603
    } else {
6604
        return v;
6605
    }
6606

    
6607
    set_float_exception_flags(old_exc_flags, status);
6608
    float_raise(float_flag_invalid STATUS_VAR);
6609
    return res;
6610
}
6611

    
6612
uint_fast16_t float32_to_uint16(float32 a STATUS_PARAM)
6613
{
6614
    int32_t v;
6615
    uint_fast16_t res;
6616
    int old_exc_flags = get_float_exception_flags(status);
6617

    
6618
    v = float32_to_int32(a STATUS_VAR);
6619
    if (v < 0) {
6620
        res = 0;
6621
    } else if (v > 0xffff) {
6622
        res = 0xffff;
6623
    } else {
6624
        return v;
6625
    }
6626

    
6627
    set_float_exception_flags(old_exc_flags, status);
6628
    float_raise(float_flag_invalid STATUS_VAR);
6629
    return res;
6630
}
6631

    
6632
uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM)
6633
{
6634
    int64_t v;
6635
    uint_fast16_t res;
6636
    int old_exc_flags = get_float_exception_flags(status);
6637

    
6638
    v = float32_to_int64_round_to_zero(a STATUS_VAR);
6639
    if (v < 0) {
6640
        res = 0;
6641
    } else if (v > 0xffff) {
6642
        res = 0xffff;
6643
    } else {
6644
        return v;
6645
    }
6646
    set_float_exception_flags(old_exc_flags, status);
6647
    float_raise(float_flag_invalid STATUS_VAR);
6648
    return res;
6649
}
6650

    
6651
uint32 float64_to_uint32( float64 a STATUS_PARAM )
6652
{
6653
    uint64_t v;
6654
    uint32 res;
6655
    int old_exc_flags = get_float_exception_flags(status);
6656

    
6657
    v = float64_to_uint64(a STATUS_VAR);
6658
    if (v > 0xffffffff) {
6659
        res = 0xffffffff;
6660
    } else {
6661
        return v;
6662
    }
6663
    set_float_exception_flags(old_exc_flags, status);
6664
    float_raise(float_flag_invalid STATUS_VAR);
6665
    return res;
6666
}
6667

    
6668
uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
6669
{
6670
    uint64_t v;
6671
    uint32 res;
6672
    int old_exc_flags = get_float_exception_flags(status);
6673

    
6674
    v = float64_to_uint64_round_to_zero(a STATUS_VAR);
6675
    if (v > 0xffffffff) {
6676
        res = 0xffffffff;
6677
    } else {
6678
        return v;
6679
    }
6680
    set_float_exception_flags(old_exc_flags, status);
6681
    float_raise(float_flag_invalid STATUS_VAR);
6682
    return res;
6683
}
6684

    
6685
int_fast16_t float64_to_int16(float64 a STATUS_PARAM)
6686
{
6687
    int64_t v;
6688
    int_fast16_t res;
6689
    int old_exc_flags = get_float_exception_flags(status);
6690

    
6691
    v = float64_to_int32(a STATUS_VAR);
6692
    if (v < -0x8000) {
6693
        res = -0x8000;
6694
    } else if (v > 0x7fff) {
6695
        res = 0x7fff;
6696
    } else {
6697
        return v;
6698
    }
6699

    
6700
    set_float_exception_flags(old_exc_flags, status);
6701
    float_raise(float_flag_invalid STATUS_VAR);
6702
    return res;
6703
}
6704

    
6705
uint_fast16_t float64_to_uint16(float64 a STATUS_PARAM)
6706
{
6707
    int64_t v;
6708
    uint_fast16_t res;
6709
    int old_exc_flags = get_float_exception_flags(status);
6710

    
6711
    v = float64_to_int32(a STATUS_VAR);
6712
    if (v < 0) {
6713
        res = 0;
6714
    } else if (v > 0xffff) {
6715
        res = 0xffff;
6716
    } else {
6717
        return v;
6718
    }
6719

    
6720
    set_float_exception_flags(old_exc_flags, status);
6721
    float_raise(float_flag_invalid STATUS_VAR);
6722
    return res;
6723
}
6724

    
6725
uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM)
6726
{
6727
    int64_t v;
6728
    uint_fast16_t res;
6729
    int old_exc_flags = get_float_exception_flags(status);
6730

    
6731
    v = float64_to_int64_round_to_zero(a STATUS_VAR);
6732
    if (v < 0) {
6733
        res = 0;
6734
    } else if (v > 0xffff) {
6735
        res = 0xffff;
6736
    } else {
6737
        return v;
6738
    }
6739
    set_float_exception_flags(old_exc_flags, status);
6740
    float_raise(float_flag_invalid STATUS_VAR);
6741
    return res;
6742
}
6743

    
6744
/*----------------------------------------------------------------------------
6745
| Returns the result of converting the double-precision floating-point value
6746
| `a' to the 64-bit unsigned integer format.  The conversion is
6747
| performed according to the IEC/IEEE Standard for Binary Floating-Point
6748
| Arithmetic---which means in particular that the conversion is rounded
6749
| according to the current rounding mode.  If `a' is a NaN, the largest
6750
| positive integer is returned.  If the conversion overflows, the
6751
| largest unsigned integer is returned.  If 'a' is negative, the value is
6752
| rounded and zero is returned; negative values that do not round to zero
6753
| will raise the inexact exception.
6754
*----------------------------------------------------------------------------*/
6755

    
6756
uint64_t float64_to_uint64(float64 a STATUS_PARAM)
6757
{
6758
    flag aSign;
6759
    int_fast16_t aExp, shiftCount;
6760
    uint64_t aSig, aSigExtra;
6761
    a = float64_squash_input_denormal(a STATUS_VAR);
6762

    
6763
    aSig = extractFloat64Frac(a);
6764
    aExp = extractFloat64Exp(a);
6765
    aSign = extractFloat64Sign(a);
6766
    if (aSign && (aExp > 1022)) {
6767
        float_raise(float_flag_invalid STATUS_VAR);
6768
        if (float64_is_any_nan(a)) {
6769
            return LIT64(0xFFFFFFFFFFFFFFFF);
6770
        } else {
6771
            return 0;
6772
        }
6773
    }
6774
    if (aExp) {
6775
        aSig |= LIT64(0x0010000000000000);
6776
    }
6777
    shiftCount = 0x433 - aExp;
6778
    if (shiftCount <= 0) {
6779
        if (0x43E < aExp) {
6780
            float_raise(float_flag_invalid STATUS_VAR);
6781
            return LIT64(0xFFFFFFFFFFFFFFFF);
6782
        }
6783
        aSigExtra = 0;
6784
        aSig <<= -shiftCount;
6785
    } else {
6786
        shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
6787
    }
6788
    return roundAndPackUint64(aSign, aSig, aSigExtra STATUS_VAR);
6789
}
6790

    
6791
uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6792
{
6793
    signed char current_rounding_mode = STATUS(float_rounding_mode);
6794
    set_float_rounding_mode(float_round_to_zero STATUS_VAR);
6795
    int64_t v = float64_to_uint64(a STATUS_VAR);
6796
    set_float_rounding_mode(current_rounding_mode STATUS_VAR);
6797
    return v;
6798
}
6799

    
6800
#define COMPARE(s, nan_exp)                                                  \
6801
INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
6802
                                      int is_quiet STATUS_PARAM )            \
6803
{                                                                            \
6804
    flag aSign, bSign;                                                       \
6805
    uint ## s ## _t av, bv;                                                  \
6806
    a = float ## s ## _squash_input_denormal(a STATUS_VAR);                  \
6807
    b = float ## s ## _squash_input_denormal(b STATUS_VAR);                  \
6808
                                                                             \
6809
    if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
6810
         extractFloat ## s ## Frac( a ) ) ||                                 \
6811
        ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
6812
          extractFloat ## s ## Frac( b ) )) {                                \
6813
        if (!is_quiet ||                                                     \
6814
            float ## s ## _is_signaling_nan( a ) ||                          \
6815
            float ## s ## _is_signaling_nan( b ) ) {                         \
6816
            float_raise( float_flag_invalid STATUS_VAR);                     \
6817
        }                                                                    \
6818
        return float_relation_unordered;                                     \
6819
    }                                                                        \
6820
    aSign = extractFloat ## s ## Sign( a );                                  \
6821
    bSign = extractFloat ## s ## Sign( b );                                  \
6822
    av = float ## s ## _val(a);                                              \
6823
    bv = float ## s ## _val(b);                                              \
6824
    if ( aSign != bSign ) {                                                  \
6825
        if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
6826
            /* zero case */                                                  \
6827
            return float_relation_equal;                                     \
6828
        } else {                                                             \
6829
            return 1 - (2 * aSign);                                          \
6830
        }                                                                    \
6831
    } else {                                                                 \
6832
        if (av == bv) {                                                      \
6833
            return float_relation_equal;                                     \
6834
        } else {                                                             \
6835
            return 1 - 2 * (aSign ^ ( av < bv ));                            \
6836
        }                                                                    \
6837
    }                                                                        \
6838
}                                                                            \
6839
                                                                             \
6840
int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
6841
{                                                                            \
6842
    return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
6843
}                                                                            \
6844
                                                                             \
6845
int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
6846
{                                                                            \
6847
    return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
6848
}
6849

    
6850
COMPARE(32, 0xff)
6851
COMPARE(64, 0x7ff)
6852

    
6853
INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
6854
                                      int is_quiet STATUS_PARAM )
6855
{
6856
    flag aSign, bSign;
6857

    
6858
    if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6859
          ( extractFloatx80Frac( a )<<1 ) ) ||
6860
        ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6861
          ( extractFloatx80Frac( b )<<1 ) )) {
6862
        if (!is_quiet ||
6863
            floatx80_is_signaling_nan( a ) ||
6864
            floatx80_is_signaling_nan( b ) ) {
6865
            float_raise( float_flag_invalid STATUS_VAR);
6866
        }
6867
        return float_relation_unordered;
6868
    }
6869
    aSign = extractFloatx80Sign( a );
6870
    bSign = extractFloatx80Sign( b );
6871
    if ( aSign != bSign ) {
6872

    
6873
        if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6874
             ( ( a.low | b.low ) == 0 ) ) {
6875
            /* zero case */
6876
            return float_relation_equal;
6877
        } else {
6878
            return 1 - (2 * aSign);
6879
        }
6880
    } else {
6881
        if (a.low == b.low && a.high == b.high) {
6882
            return float_relation_equal;
6883
        } else {
6884
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6885
        }
6886
    }
6887
}
6888

    
6889
int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
6890
{
6891
    return floatx80_compare_internal(a, b, 0 STATUS_VAR);
6892
}
6893

    
6894
int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
6895
{
6896
    return floatx80_compare_internal(a, b, 1 STATUS_VAR);
6897
}
6898

    
6899
INLINE int float128_compare_internal( float128 a, float128 b,
6900
                                      int is_quiet STATUS_PARAM )
6901
{
6902
    flag aSign, bSign;
6903

    
6904
    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6905
          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6906
        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6907
          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6908
        if (!is_quiet ||
6909
            float128_is_signaling_nan( a ) ||
6910
            float128_is_signaling_nan( b ) ) {
6911
            float_raise( float_flag_invalid STATUS_VAR);
6912
        }
6913
        return float_relation_unordered;
6914
    }
6915
    aSign = extractFloat128Sign( a );
6916
    bSign = extractFloat128Sign( b );
6917
    if ( aSign != bSign ) {
6918
        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6919
            /* zero case */
6920
            return float_relation_equal;
6921
        } else {
6922
            return 1 - (2 * aSign);
6923
        }
6924
    } else {
6925
        if (a.low == b.low && a.high == b.high) {
6926
            return float_relation_equal;
6927
        } else {
6928
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6929
        }
6930
    }
6931
}
6932

    
6933
int float128_compare( float128 a, float128 b STATUS_PARAM )
6934
{
6935
    return float128_compare_internal(a, b, 0 STATUS_VAR);
6936
}
6937

    
6938
int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6939
{
6940
    return float128_compare_internal(a, b, 1 STATUS_VAR);
6941
}
6942

    
6943
/* min() and max() functions. These can't be implemented as
6944
 * 'compare and pick one input' because that would mishandle
6945
 * NaNs and +0 vs -0.
6946
 *
6947
 * minnum() and maxnum() functions. These are similar to the min()
6948
 * and max() functions but if one of the arguments is a QNaN and
6949
 * the other is numerical then the numerical argument is returned.
6950
 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
6951
 * and maxNum() operations. min() and max() are the typical min/max
6952
 * semantics provided by many CPUs which predate that specification.
6953
 */
6954
#define MINMAX(s)                                                       \
6955
INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
6956
                                        int ismin, int isieee STATUS_PARAM) \
6957
{                                                                       \
6958
    flag aSign, bSign;                                                  \
6959
    uint ## s ## _t av, bv;                                             \
6960
    a = float ## s ## _squash_input_denormal(a STATUS_VAR);             \
6961
    b = float ## s ## _squash_input_denormal(b STATUS_VAR);             \
6962
    if (float ## s ## _is_any_nan(a) ||                                 \
6963
        float ## s ## _is_any_nan(b)) {                                 \
6964
        if (isieee) {                                                   \
6965
            if (float ## s ## _is_quiet_nan(a) &&                       \
6966
                !float ## s ##_is_any_nan(b)) {                         \
6967
                return b;                                               \
6968
            } else if (float ## s ## _is_quiet_nan(b) &&                \
6969
                       !float ## s ## _is_any_nan(a)) {                 \
6970
                return a;                                               \
6971
            }                                                           \
6972
        }                                                               \
6973
        return propagateFloat ## s ## NaN(a, b STATUS_VAR);             \
6974
    }                                                                   \
6975
    aSign = extractFloat ## s ## Sign(a);                               \
6976
    bSign = extractFloat ## s ## Sign(b);                               \
6977
    av = float ## s ## _val(a);                                         \
6978
    bv = float ## s ## _val(b);                                         \
6979
    if (aSign != bSign) {                                               \
6980
        if (ismin) {                                                    \
6981
            return aSign ? a : b;                                       \
6982
        } else {                                                        \
6983
            return aSign ? b : a;                                       \
6984
        }                                                               \
6985
    } else {                                                            \
6986
        if (ismin) {                                                    \
6987
            return (aSign ^ (av < bv)) ? a : b;                         \
6988
        } else {                                                        \
6989
            return (aSign ^ (av < bv)) ? b : a;                         \
6990
        }                                                               \
6991
    }                                                                   \
6992
}                                                                       \
6993
                                                                        \
6994
float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM)  \
6995
{                                                                       \
6996
    return float ## s ## _minmax(a, b, 1, 0 STATUS_VAR);                \
6997
}                                                                       \
6998
                                                                        \
6999
float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM)  \
7000
{                                                                       \
7001
    return float ## s ## _minmax(a, b, 0, 0 STATUS_VAR);                \
7002
}                                                                       \
7003
                                                                        \
7004
float ## s float ## s ## _minnum(float ## s a, float ## s b STATUS_PARAM) \
7005
{                                                                       \
7006
    return float ## s ## _minmax(a, b, 1, 1 STATUS_VAR);                \
7007
}                                                                       \
7008
                                                                        \
7009
float ## s float ## s ## _maxnum(float ## s a, float ## s b STATUS_PARAM) \
7010
{                                                                       \
7011
    return float ## s ## _minmax(a, b, 0, 1 STATUS_VAR);                \
7012
}
7013

    
7014
MINMAX(32)
7015
MINMAX(64)
7016

    
7017

    
7018
/* Multiply A by 2 raised to the power N.  */
7019
float32 float32_scalbn( float32 a, int n STATUS_PARAM )
7020
{
7021
    flag aSign;
7022
    int16_t aExp;
7023
    uint32_t aSig;
7024

    
7025
    a = float32_squash_input_denormal(a STATUS_VAR);
7026
    aSig = extractFloat32Frac( a );
7027
    aExp = extractFloat32Exp( a );
7028
    aSign = extractFloat32Sign( a );
7029

    
7030
    if ( aExp == 0xFF ) {
7031
        if ( aSig ) {
7032
            return propagateFloat32NaN( a, a STATUS_VAR );
7033
        }
7034
        return a;
7035
    }
7036
    if (aExp != 0) {
7037
        aSig |= 0x00800000;
7038
    } else if (aSig == 0) {
7039
        return a;
7040
    } else {
7041
        aExp++;
7042
    }
7043

    
7044
    if (n > 0x200) {
7045
        n = 0x200;
7046
    } else if (n < -0x200) {
7047
        n = -0x200;
7048
    }
7049

    
7050
    aExp += n - 1;
7051
    aSig <<= 7;
7052
    return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
7053
}
7054

    
7055
float64 float64_scalbn( float64 a, int n STATUS_PARAM )
7056
{
7057
    flag aSign;
7058
    int16_t aExp;
7059
    uint64_t aSig;
7060

    
7061
    a = float64_squash_input_denormal(a STATUS_VAR);
7062
    aSig = extractFloat64Frac( a );
7063
    aExp = extractFloat64Exp( a );
7064
    aSign = extractFloat64Sign( a );
7065

    
7066
    if ( aExp == 0x7FF ) {
7067
        if ( aSig ) {
7068
            return propagateFloat64NaN( a, a STATUS_VAR );
7069
        }
7070
        return a;
7071
    }
7072
    if (aExp != 0) {
7073
        aSig |= LIT64( 0x0010000000000000 );
7074
    } else if (aSig == 0) {
7075
        return a;
7076
    } else {
7077
        aExp++;
7078
    }
7079

    
7080
    if (n > 0x1000) {
7081
        n = 0x1000;
7082
    } else if (n < -0x1000) {
7083
        n = -0x1000;
7084
    }
7085

    
7086
    aExp += n - 1;
7087
    aSig <<= 10;
7088
    return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
7089
}
7090

    
7091
floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
7092
{
7093
    flag aSign;
7094
    int32_t aExp;
7095
    uint64_t aSig;
7096

    
7097
    aSig = extractFloatx80Frac( a );
7098
    aExp = extractFloatx80Exp( a );
7099
    aSign = extractFloatx80Sign( a );
7100

    
7101
    if ( aExp == 0x7FFF ) {
7102
        if ( aSig<<1 ) {
7103
            return propagateFloatx80NaN( a, a STATUS_VAR );
7104
        }
7105
        return a;
7106
    }
7107

    
7108
    if (aExp == 0) {
7109
        if (aSig == 0) {
7110
            return a;
7111
        }
7112
        aExp++;
7113
    }
7114

    
7115
    if (n > 0x10000) {
7116
        n = 0x10000;
7117
    } else if (n < -0x10000) {
7118
        n = -0x10000;
7119
    }
7120

    
7121
    aExp += n;
7122
    return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
7123
                                          aSign, aExp, aSig, 0 STATUS_VAR );
7124
}
7125

    
7126
float128 float128_scalbn( float128 a, int n STATUS_PARAM )
7127
{
7128
    flag aSign;
7129
    int32_t aExp;
7130
    uint64_t aSig0, aSig1;
7131

    
7132
    aSig1 = extractFloat128Frac1( a );
7133
    aSig0 = extractFloat128Frac0( a );
7134
    aExp = extractFloat128Exp( a );
7135
    aSign = extractFloat128Sign( a );
7136
    if ( aExp == 0x7FFF ) {
7137
        if ( aSig0 | aSig1 ) {
7138
            return propagateFloat128NaN( a, a STATUS_VAR );
7139
        }
7140
        return a;
7141
    }
7142
    if (aExp != 0) {
7143
        aSig0 |= LIT64( 0x0001000000000000 );
7144
    } else if (aSig0 == 0 && aSig1 == 0) {
7145
        return a;
7146
    } else {
7147
        aExp++;
7148
    }
7149

    
7150
    if (n > 0x10000) {
7151
        n = 0x10000;
7152
    } else if (n < -0x10000) {
7153
        n = -0x10000;
7154
    }
7155

    
7156
    aExp += n - 1;
7157
    return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7158
                                          STATUS_VAR );
7159

    
7160
}