Statistics
| Branch: | Revision:

root / fpu / softfloat.c @ f9288a76

History | View | Annotate | Download (260.3 kB)

1
/*
2
 * QEMU float support
3
 *
4
 * Derived from SoftFloat.
5
 */
6

    
7
/*============================================================================
8

9
This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
10
Package, Release 2b.
11

12
Written by John R. Hauser.  This work was made possible in part by the
13
International Computer Science Institute, located at Suite 600, 1947 Center
14
Street, Berkeley, California 94704.  Funding was partially provided by the
15
National Science Foundation under grant MIP-9311980.  The original version
16
of this code was written as part of a project to build a fixed-point vector
17
processor in collaboration with the University of California at Berkeley,
18
overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
19
is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20
arithmetic/SoftFloat.html'.
21

22
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
23
been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24
RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25
AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26
COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27
EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28
INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29
OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
30

31
Derivative works are acceptable, even for commercial purposes, so long as
32
(1) the source code for the derivative work includes prominent notice that
33
the work is derivative, and (2) the source code includes prominent notice with
34
these four paragraphs for those parts of this code that are retained.
35

36
=============================================================================*/
37

    
38
/* softfloat (and in particular the code in softfloat-specialize.h) is
39
 * target-dependent and needs the TARGET_* macros.
40
 */
41
#include "config.h"
42

    
43
#include "fpu/softfloat.h"
44

    
45
/* We only need stdlib for abort() */
46
#include <stdlib.h>
47

    
48
/*----------------------------------------------------------------------------
49
| Primitive arithmetic functions, including multi-word arithmetic, and
50
| division and square root approximations.  (Can be specialized to target if
51
| desired.)
52
*----------------------------------------------------------------------------*/
53
#include "softfloat-macros.h"
54

    
55
/*----------------------------------------------------------------------------
56
| Functions and definitions to determine:  (1) whether tininess for underflow
57
| is detected before or after rounding by default, (2) what (if anything)
58
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
59
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
60
| are propagated from function inputs to output.  These details are target-
61
| specific.
62
*----------------------------------------------------------------------------*/
63
#include "softfloat-specialize.h"
64

    
65
/*----------------------------------------------------------------------------
66
| Returns the fraction bits of the half-precision floating-point value `a'.
67
*----------------------------------------------------------------------------*/
68

    
69
INLINE uint32_t extractFloat16Frac(float16 a)
{
    /* The fraction field is the low 10 bits of the value from float16_val(). */
    uint32_t raw = float16_val(a);

    return raw & 0x3ff;
}
73

    
74
/*----------------------------------------------------------------------------
75
| Returns the exponent bits of the half-precision floating-point value `a'.
76
*----------------------------------------------------------------------------*/
77

    
78
INLINE int_fast16_t extractFloat16Exp(float16 a)
{
    /* Drop the 10 fraction bits, then keep the 5-bit exponent field. */
    int_fast16_t expField = (float16_val(a) >> 10) & 0x1f;

    return expField;
}
82

    
83
/*----------------------------------------------------------------------------
84
| Returns the sign bit of the half-precision floating-point value `a'.
85
*----------------------------------------------------------------------------*/
86

    
87
INLINE flag extractFloat16Sign(float16 a)
{
    /* Bit 15 of the value from float16_val() is the sign. */
    flag signBit = float16_val(a) >> 15;

    return signBit;
}
91

    
92
/*----------------------------------------------------------------------------
93
| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
94
| and 7, and returns the properly rounded 32-bit integer corresponding to the
95
| input.  If `zSign' is 1, the input is negated before being converted to an
96
| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
97
| is simply rounded to an integer, with the inexact exception raised if the
98
| input cannot be represented exactly as an integer.  However, if the fixed-
99
| point input is too large, the invalid exception is raised and the largest
100
| positive or negative integer is returned.
101
*----------------------------------------------------------------------------*/
102

    
103
static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
{
    int8 roundingMode;
    flag roundNearestEven;
    int8 roundIncrement, roundBits;
    uint32_t zMag;
    int32_t z;

    roundingMode = STATUS(float_rounding_mode);
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /*
     * Select the amount added to the 7 fraction bits before they are shifted
     * out: half a unit for the round-to-nearest modes, nothing for
     * truncation, and just under one unit when a directed mode rounds away
     * from zero for this sign.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        roundIncrement = 0x40;
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        roundIncrement = zSign ? 0x7f : 0;
        break;
    default:
        abort();
    }
    /* The discarded fraction bits decide both tie handling and inexactness. */
    roundBits = absZ & 0x7F;
    absZ = ( absZ + roundIncrement )>>7;
    /* On an exact tie in nearest-even mode, clear bit 0 to land on even. */
    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
    /*
     * Apply the sign in unsigned arithmetic.  When the rounded magnitude is
     * 0x80000000 (the valid result INT32_MIN), negating a signed int32_t
     * would overflow, which is undefined behaviour in ISO C.  Unsigned
     * negation wraps and yields the same bit pattern.
     */
    zMag = absZ;
    if ( zSign ) {
        zMag = 0 - zMag;
    }
    z = (int32_t) zMag;
    /*
     * Out of range if the magnitude needs more than 32 bits, or if the sign
     * of the non-zero result disagrees with the requested sign.
     */
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
        float_raise( float_flag_invalid STATUS_VAR);
        return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
    }
    if ( roundBits ) {
        STATUS(float_exception_flags) |= float_flag_inexact;
    }
    return z;
}
142

    
143
/*----------------------------------------------------------------------------
144
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
145
| `absZ1', with binary point between bits 63 and 64 (between the input words),
146
| and returns the properly rounded 64-bit integer corresponding to the input.
147
| If `zSign' is 1, the input is negated before being converted to an integer.
148
| Ordinarily, the fixed-point input is simply rounded to an integer, with
149
| the inexact exception raised if the input cannot be represented exactly as
150
| an integer.  However, if the fixed-point input is too large, the invalid
151
| exception is raised and the largest positive or negative integer is
152
| returned.
153
*----------------------------------------------------------------------------*/
154

    
155
static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
{
    int8 roundingMode;
    flag roundNearestEven, increment;
    uint64_t zMag;
    int64_t z;

    roundingMode = STATUS(float_rounding_mode);
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /*
     * absZ0 is the integer part and absZ1 the fraction.  Decide whether the
     * fraction causes the integer part to be bumped by one.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Top bit of absZ1 set means the fraction is at least one half. */
        increment = ((int64_t) absZ1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && absZ1;
        break;
    case float_round_down:
        increment = zSign && absZ1;
        break;
    default:
        abort();
    }
    if ( increment ) {
        ++absZ0;
        /* Wrapping to zero means the magnitude no longer fits in 64 bits. */
        if ( absZ0 == 0 ) goto overflow;
        /* Exact tie (fraction is exactly one half) in nearest-even mode:
         * clear bit 0 so the result is even. */
        absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
    }
    /*
     * Apply the sign in unsigned arithmetic.  When the magnitude is 2^63
     * (the valid result INT64_MIN), negating a signed int64_t would
     * overflow, which is undefined behaviour in ISO C.  Unsigned negation
     * wraps and yields the same bit pattern.
     */
    zMag = absZ0;
    if ( zSign ) {
        zMag = 0 - zMag;
    }
    z = (int64_t) zMag;
    /* A non-zero result whose sign disagrees with zSign did not fit. */
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
 overflow:
        float_raise( float_flag_invalid STATUS_VAR);
        return
              zSign ? (int64_t) LIT64( 0x8000000000000000 )
            : LIT64( 0x7FFFFFFFFFFFFFFF );
    }
    if ( absZ1 ) {
        STATUS(float_exception_flags) |= float_flag_inexact;
    }
    return z;
}
198

    
199
/*----------------------------------------------------------------------------
200
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
201
| `absZ1', with binary point between bits 63 and 64 (between the input words),
202
| and returns the properly rounded 64-bit unsigned integer corresponding to the
203
| input.  Ordinarily, the fixed-point input is simply rounded to an integer,
204
| with the inexact exception raised if the input cannot be represented exactly
205
| as an integer.  However, if the fixed-point input is too large, the invalid
206
| exception is raised and the largest unsigned integer is returned.
207
*----------------------------------------------------------------------------*/
208

    
209
static int64 roundAndPackUint64(flag zSign, uint64_t absZ0,
210
                                uint64_t absZ1 STATUS_PARAM)
211
{
212
    int8 roundingMode;
213
    flag roundNearestEven, increment;
214

    
215
    roundingMode = STATUS(float_rounding_mode);
216
    roundNearestEven = (roundingMode == float_round_nearest_even);
217
    switch (roundingMode) {
218
    case float_round_nearest_even:
219
    case float_round_ties_away:
220
        increment = ((int64_t)absZ1 < 0);
221
        break;
222
    case float_round_to_zero:
223
        increment = 0;
224
        break;
225
    case float_round_up:
226
        increment = !zSign && absZ1;
227
        break;
228
    case float_round_down:
229
        increment = zSign && absZ1;
230
        break;
231
    default:
232
        abort();
233
    }
234
    if (increment) {
235
        ++absZ0;
236
        if (absZ0 == 0) {
237
            float_raise(float_flag_invalid STATUS_VAR);
238
            return LIT64(0xFFFFFFFFFFFFFFFF);
239
        }
240
        absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
241
    }
242

    
243
    if (zSign && absZ0) {
244
        float_raise(float_flag_invalid STATUS_VAR);
245
        return 0;
246
    }
247

    
248
    if (absZ1) {
249
        STATUS(float_exception_flags) |= float_flag_inexact;
250
    }
251
    return absZ0;
252
}
253

    
254
/*----------------------------------------------------------------------------
255
| Returns the fraction bits of the single-precision floating-point value `a'.
256
*----------------------------------------------------------------------------*/
257

    
258
INLINE uint32_t extractFloat32Frac( float32 a )
{
    /* The fraction field is the low 23 bits of the value from float32_val(). */
    uint32_t raw = float32_val(a);

    return raw & 0x007FFFFF;
}
264

    
265
/*----------------------------------------------------------------------------
266
| Returns the exponent bits of the single-precision floating-point value `a'.
267
*----------------------------------------------------------------------------*/
268

    
269
INLINE int_fast16_t extractFloat32Exp(float32 a)
{
    /* Drop the 23 fraction bits, then keep the 8-bit exponent field. */
    int_fast16_t expField = ( float32_val(a)>>23 ) & 0xFF;

    return expField;
}
275

    
276
/*----------------------------------------------------------------------------
277
| Returns the sign bit of the single-precision floating-point value `a'.
278
*----------------------------------------------------------------------------*/
279

    
280
INLINE flag extractFloat32Sign( float32 a )
{
    /* Bit 31 of the value from float32_val() is the sign. */
    flag signBit = float32_val(a)>>31;

    return signBit;
}
286

    
287
/*----------------------------------------------------------------------------
288
| If `a' is denormal and we are in flush-to-zero mode then set the
289
| input-denormal exception and return zero. Otherwise just return the value.
290
*----------------------------------------------------------------------------*/
291
static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
{
    /* Inputs are only altered when input flushing is enabled. */
    if (!STATUS(flush_inputs_to_zero)) {
        return a;
    }
    /* A denormal has a zero exponent field and a non-zero fraction. */
    if (extractFloat32Exp(a) != 0 || extractFloat32Frac(a) == 0) {
        return a;
    }
    float_raise(float_flag_input_denormal STATUS_VAR);
    /* Keep only the sign bit: the result is a zero with the input's sign. */
    return make_float32(float32_val(a) & 0x80000000);
}
301

    
302
/*----------------------------------------------------------------------------
303
| Normalizes the subnormal single-precision floating-point value represented
304
| by the denormalized significand `aSig'.  The normalized exponent and
305
| significand are stored at the locations pointed to by `zExpPtr' and
306
| `zSigPtr', respectively.
307
*----------------------------------------------------------------------------*/
308

    
309
static void
310
 normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
311
{
312
    int8 shiftCount;
313

    
314
    shiftCount = countLeadingZeros32( aSig ) - 8;
315
    *zSigPtr = aSig<<shiftCount;
316
    *zExpPtr = 1 - shiftCount;
317

    
318
}
319

    
320
/*----------------------------------------------------------------------------
321
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
322
| single-precision floating-point value, returning the result.  After being
323
| shifted into the proper positions, the three fields are simply added
324
| together to form the result.  This means that any integer portion of `zSig'
325
| will be added into the exponent.  Since a properly normalized significand
326
| will have an integer portion equal to 1, the `zExp' input should be 1 less
327
| than the desired result exponent whenever `zSig' is a complete, normalized
328
| significand.
329
*----------------------------------------------------------------------------*/
330

    
331
INLINE float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
{
    uint32_t signField = ( (uint32_t) zSign )<<31;
    uint32_t expField = ( (uint32_t) zExp )<<23;

    /*
     * The fields are added rather than OR-ed, so any integer bit present in
     * zSig carries into the exponent field.
     */
    return make_float32(signField + expField + zSig);
}
338

    
339
/*----------------------------------------------------------------------------
340
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
341
| and significand `zSig', and returns the proper single-precision floating-
342
| point value corresponding to the abstract input.  Ordinarily, the abstract
343
| value is simply rounded and packed into the single-precision format, with
344
| the inexact exception raised if the abstract input cannot be represented
345
| exactly.  However, if the abstract value is too large, the overflow and
346
| inexact exceptions are raised and an infinity or maximal finite value is
347
| returned.  If the abstract value is too small, the input value is rounded to
348
| a subnormal number, and the underflow and inexact exceptions are raised if
349
| the abstract input cannot be represented exactly as a subnormal single-
350
| precision floating-point number.
351
|     The input significand `zSig' has its binary point between bits 30
352
| and 29, which is 7 bits to the left of the usual location.  This shifted
353
| significand must be normalized or smaller.  If `zSig' is not normalized,
354
| `zExp' must be 0; in that case, the result returned is a subnormal number,
355
| and it must not require rounding.  In the usual case that `zSig' is
356
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
357
| The handling of underflow and overflow follows the IEC/IEEE Standard for
358
| Binary Floating-Point Arithmetic.
359
*----------------------------------------------------------------------------*/
360

    
361
static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
{
    int8 mode = STATUS(float_rounding_mode);
    flag tiesToEven = ( mode == float_round_nearest_even );
    int8 incr, lowBits;
    flag overflows, tiny;

    /* incr is added to the 7 extra low bits of zSig before they are dropped. */
    switch (mode) {
    case float_round_to_zero:
        incr = 0;
        break;
    case float_round_up:
        incr = zSign ? 0 : 0x7f;
        break;
    case float_round_down:
        incr = zSign ? 0x7f : 0;
        break;
    case float_round_nearest_even:
    case float_round_ties_away:
        incr = 0x40;
        break;
    default:
        abort();
    }
    lowBits = zSig & 0x7F;
    /*
     * The unsigned cast lets one comparison catch both large exponents and
     * negative ones (which wrap to large unsigned values).
     */
    if ( (uint16_t) zExp >= 0xFD ) {
        overflows = ( zExp > 0xFD )
            || ( ( zExp == 0xFD ) && ( (int32_t) ( zSig + incr ) < 0 ) );
        if ( overflows ) {
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
            /*
             * With no rounding increment the significand argument is all
             * ones, which packFloat32's addition turns into the largest
             * finite encoding; otherwise it is zero, giving exponent 0xFF
             * with an empty fraction.
             */
            return packFloat32( zSign, 0xFF, - ( incr == 0 ));
        }
        if ( zExp < 0 ) {
            if (STATUS(flush_to_zero)) {
                float_raise(float_flag_output_denormal STATUS_VAR);
                return packFloat32(zSign, 0, 0);
            }
            /* Tininess must be judged before zSig is denormalized below. */
            tiny = ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
                || ( zExp < -1 )
                || ( zSig + incr < 0x80000000 );
            shift32RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            lowBits = zSig & 0x7F;
            if ( tiny && lowBits ) {
                float_raise( float_flag_underflow STATUS_VAR);
            }
        }
    }
    if ( lowBits ) {
        STATUS(float_exception_flags) |= float_flag_inexact;
    }
    zSig = ( zSig + incr )>>7;
    if ( tiesToEven && lowBits == 0x40 ) {
        /* Exact tie: clear bit 0 so the rounded significand is even. */
        zSig &= ~(uint32_t)1;
    }
    if ( zSig == 0 ) {
        zExp = 0;
    }
    return packFloat32( zSign, zExp, zSig );
}
419

    
420
/*----------------------------------------------------------------------------
421
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
422
| and significand `zSig', and returns the proper single-precision floating-
423
| point value corresponding to the abstract input.  This routine is just like
424
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
425
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
426
| floating-point exponent.
427
*----------------------------------------------------------------------------*/
428

    
429
static float32
430
 normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
431
{
432
    int8 shiftCount;
433

    
434
    shiftCount = countLeadingZeros32( zSig ) - 1;
435
    return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
436

    
437
}
438

    
439
/*----------------------------------------------------------------------------
440
| Returns the fraction bits of the double-precision floating-point value `a'.
441
*----------------------------------------------------------------------------*/
442

    
443
INLINE uint64_t extractFloat64Frac( float64 a )
{
    /* The fraction field is the low 52 bits of the value from float64_val(). */
    uint64_t raw = float64_val(a);

    return raw & LIT64( 0x000FFFFFFFFFFFFF );
}
449

    
450
/*----------------------------------------------------------------------------
451
| Returns the exponent bits of the double-precision floating-point value `a'.
452
*----------------------------------------------------------------------------*/
453

    
454
INLINE int_fast16_t extractFloat64Exp(float64 a)
{
    /* Drop the 52 fraction bits, then keep the 11-bit exponent field. */
    int_fast16_t expField = ( float64_val(a)>>52 ) & 0x7FF;

    return expField;
}
460

    
461
/*----------------------------------------------------------------------------
462
| Returns the sign bit of the double-precision floating-point value `a'.
463
*----------------------------------------------------------------------------*/
464

    
465
INLINE flag extractFloat64Sign( float64 a )
{
    /* Bit 63 of the value from float64_val() is the sign. */
    flag signBit = float64_val(a)>>63;

    return signBit;
}
471

    
472
/*----------------------------------------------------------------------------
473
| If `a' is denormal and we are in flush-to-zero mode then set the
474
| input-denormal exception and return zero. Otherwise just return the value.
475
*----------------------------------------------------------------------------*/
476
static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
{
    /* Inputs are only altered when input flushing is enabled. */
    if (!STATUS(flush_inputs_to_zero)) {
        return a;
    }
    /* A denormal has a zero exponent field and a non-zero fraction. */
    if (extractFloat64Exp(a) != 0 || extractFloat64Frac(a) == 0) {
        return a;
    }
    float_raise(float_flag_input_denormal STATUS_VAR);
    /* Keep only the sign bit: the result is a zero with the input's sign. */
    return make_float64(float64_val(a) & (1ULL << 63));
}
486

    
487
/*----------------------------------------------------------------------------
488
| Normalizes the subnormal double-precision floating-point value represented
489
| by the denormalized significand `aSig'.  The normalized exponent and
490
| significand are stored at the locations pointed to by `zExpPtr' and
491
| `zSigPtr', respectively.
492
*----------------------------------------------------------------------------*/
493

    
494
static void
495
 normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
496
{
497
    int8 shiftCount;
498

    
499
    shiftCount = countLeadingZeros64( aSig ) - 11;
500
    *zSigPtr = aSig<<shiftCount;
501
    *zExpPtr = 1 - shiftCount;
502

    
503
}
504

    
505
/*----------------------------------------------------------------------------
506
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
507
| double-precision floating-point value, returning the result.  After being
508
| shifted into the proper positions, the three fields are simply added
509
| together to form the result.  This means that any integer portion of `zSig'
510
| will be added into the exponent.  Since a properly normalized significand
511
| will have an integer portion equal to 1, the `zExp' input should be 1 less
512
| than the desired result exponent whenever `zSig' is a complete, normalized
513
| significand.
514
*----------------------------------------------------------------------------*/
515

    
516
INLINE float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
{
    uint64_t signField = ( (uint64_t) zSign )<<63;
    uint64_t expField = ( (uint64_t) zExp )<<52;

    /*
     * The fields are added rather than OR-ed, so any integer bit present in
     * zSig carries into the exponent field.
     */
    return make_float64(signField + expField + zSig);
}
523

    
524
/*----------------------------------------------------------------------------
525
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
526
| and significand `zSig', and returns the proper double-precision floating-
527
| point value corresponding to the abstract input.  Ordinarily, the abstract
528
| value is simply rounded and packed into the double-precision format, with
529
| the inexact exception raised if the abstract input cannot be represented
530
| exactly.  However, if the abstract value is too large, the overflow and
531
| inexact exceptions are raised and an infinity or maximal finite value is
532
| returned.  If the abstract value is too small, the input value is rounded
533
| to a subnormal number, and the underflow and inexact exceptions are raised
534
| if the abstract input cannot be represented exactly as a subnormal double-
535
| precision floating-point number.
536
|     The input significand `zSig' has its binary point between bits 62
537
| and 61, which is 10 bits to the left of the usual location.  This shifted
538
| significand must be normalized or smaller.  If `zSig' is not normalized,
539
| `zExp' must be 0; in that case, the result returned is a subnormal number,
540
| and it must not require rounding.  In the usual case that `zSig' is
541
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
542
| The handling of underflow and overflow follows the IEC/IEEE Standard for
543
| Binary Floating-Point Arithmetic.
544
*----------------------------------------------------------------------------*/
545

    
546
static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
{
    int8 mode = STATUS(float_rounding_mode);
    flag tiesToEven = ( mode == float_round_nearest_even );
    int_fast16_t incr, lowBits;
    flag overflows, tiny;

    /* incr is added to the 10 extra low bits of zSig before they are dropped. */
    switch (mode) {
    case float_round_to_zero:
        incr = 0;
        break;
    case float_round_up:
        incr = zSign ? 0 : 0x3ff;
        break;
    case float_round_down:
        incr = zSign ? 0x3ff : 0;
        break;
    case float_round_nearest_even:
    case float_round_ties_away:
        incr = 0x200;
        break;
    default:
        abort();
    }
    lowBits = zSig & 0x3FF;
    /*
     * The unsigned cast lets one comparison catch both large exponents and
     * negative ones (which wrap to large unsigned values).
     */
    if ( (uint16_t) zExp >= 0x7FD ) {
        overflows = ( zExp > 0x7FD )
            || ( ( zExp == 0x7FD ) && ( (int64_t) ( zSig + incr ) < 0 ) );
        if ( overflows ) {
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
            /*
             * With no rounding increment the significand argument is all
             * ones, which packFloat64's addition turns into the largest
             * finite encoding; otherwise it is zero, giving exponent 0x7FF
             * with an empty fraction.
             */
            return packFloat64( zSign, 0x7FF, - ( incr == 0 ));
        }
        if ( zExp < 0 ) {
            if (STATUS(flush_to_zero)) {
                float_raise(float_flag_output_denormal STATUS_VAR);
                return packFloat64(zSign, 0, 0);
            }
            /* Tininess must be judged before zSig is denormalized below. */
            tiny = ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
                || ( zExp < -1 )
                || ( zSig + incr < LIT64( 0x8000000000000000 ) );
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            lowBits = zSig & 0x3FF;
            if ( tiny && lowBits ) {
                float_raise( float_flag_underflow STATUS_VAR);
            }
        }
    }
    if ( lowBits ) {
        STATUS(float_exception_flags) |= float_flag_inexact;
    }
    zSig = ( zSig + incr )>>10;
    if ( tiesToEven && lowBits == 0x200 ) {
        /* Exact tie: clear bit 0 so the rounded significand is even. */
        zSig &= ~(uint64_t)1;
    }
    if ( zSig == 0 ) {
        zExp = 0;
    }
    return packFloat64( zSign, zExp, zSig );
}
603

    
604
/*----------------------------------------------------------------------------
605
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
606
| and significand `zSig', and returns the proper double-precision floating-
607
| point value corresponding to the abstract input.  This routine is just like
608
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
609
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
610
| floating-point exponent.
611
*----------------------------------------------------------------------------*/
612

    
613
static float64
614
 normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
615
{
616
    int8 shiftCount;
617

    
618
    shiftCount = countLeadingZeros64( zSig ) - 1;
619
    return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
620

    
621
}
622

    
623
/*----------------------------------------------------------------------------
624
| Returns the fraction bits of the extended double-precision floating-point
625
| value `a'.
626
*----------------------------------------------------------------------------*/
627

    
628
INLINE uint64_t extractFloatx80Frac( floatx80 a )
{
    /* The fraction bits are the whole of the `low' member. */
    uint64_t sig = a.low;

    return sig;
}
634

    
635
/*----------------------------------------------------------------------------
636
| Returns the exponent bits of the extended double-precision floating-point
637
| value `a'.
638
*----------------------------------------------------------------------------*/
639

    
640
INLINE int32 extractFloatx80Exp( floatx80 a )
{
    /* The low 15 bits of the `high' member are the exponent field. */
    int32 expField = a.high & 0x7FFF;

    return expField;
}
646

    
647
/*----------------------------------------------------------------------------
648
| Returns the sign bit of the extended double-precision floating-point value
649
| `a'.
650
*----------------------------------------------------------------------------*/
651

    
652
INLINE flag extractFloatx80Sign( floatx80 a )
{
    /* Bit 15 of the `high' member is the sign. */
    flag signBit = a.high>>15;

    return signBit;
}
658

    
659
/*----------------------------------------------------------------------------
660
| Normalizes the subnormal extended double-precision floating-point value
661
| represented by the denormalized significand `aSig'.  The normalized exponent
662
| and significand are stored at the locations pointed to by `zExpPtr' and
663
| `zSigPtr', respectively.
664
*----------------------------------------------------------------------------*/
665

    
666
static void
667
 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
668
{
669
    int8 shiftCount;
670

    
671
    shiftCount = countLeadingZeros64( aSig );
672
    *zSigPtr = aSig<<shiftCount;
673
    *zExpPtr = 1 - shiftCount;
674

    
675
}
676

    
677
/*----------------------------------------------------------------------------
678
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
679
| extended double-precision floating-point value, returning the result.
680
*----------------------------------------------------------------------------*/
681

    
682
INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
{
    floatx80 packed;

    /* The sign goes in bit 15 of `high'; the exponent is added below it. */
    packed.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
    /* The significand is stored unchanged in `low'. */
    packed.low = zSig;
    return packed;
}
691

    
692
/*----------------------------------------------------------------------------
693
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
694
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
695
| and returns the proper extended double-precision floating-point value
696
| corresponding to the abstract input.  Ordinarily, the abstract value is
697
| rounded and packed into the extended double-precision format, with the
698
| inexact exception raised if the abstract input cannot be represented
699
| exactly.  However, if the abstract value is too large, the overflow and
700
| inexact exceptions are raised and an infinity or maximal finite value is
701
| returned.  If the abstract value is too small, the input value is rounded to
702
| a subnormal number, and the underflow and inexact exceptions are raised if
703
| the abstract input cannot be represented exactly as a subnormal extended
704
| double-precision floating-point number.
705
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
706
| number of bits as single or double precision, respectively.  Otherwise, the
707
| result is rounded to the full precision of the extended double-precision
708
| format.
709
|     The input significand must be normalized or smaller.  If the input
710
| significand is not normalized, `zExp' must be 0; in that case, the result
711
| returned is a subnormal number, and it must not require rounding.  The
712
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
713
| Floating-Point Arithmetic.
714
*----------------------------------------------------------------------------*/
715

    
716
static floatx80
717
 roundAndPackFloatx80(
718
     int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
719
 STATUS_PARAM)
720
{
721
    int8 roundingMode;
722
    flag roundNearestEven, increment, isTiny;
723
    int64 roundIncrement, roundMask, roundBits;
724

    
725
    /* The rounding mode is read once from the status; a roundingPrecision of
     * 64 or 32 selects the reduced-precision path below, and every other
     * value (including 80) jumps to the `precision80' label. */
    roundingMode = STATUS(float_rounding_mode);
726
    roundNearestEven = ( roundingMode == float_round_nearest_even );
727
    if ( roundingPrecision == 80 ) goto precision80;
728
    if ( roundingPrecision == 64 ) {
729
        /* The low 11 bits of zSig0 are discarded; 0x400 is half of the
         * lowest retained bit. */
        roundIncrement = LIT64( 0x0000000000000400 );
730
        roundMask = LIT64( 0x00000000000007FF );
731
    }
732
    else if ( roundingPrecision == 32 ) {
733
        /* The low 40 bits of zSig0 are discarded; the increment is again
         * half of the lowest retained bit. */
        roundIncrement = LIT64( 0x0000008000000000 );
734
        roundMask = LIT64( 0x000000FFFFFFFFFF );
735
    }
736
    else {
737
        goto precision80;
738
    }
739
    /* Any nonzero bit in zSig1 is folded into bit 0 of zSig0 (sticky bit). */
    zSig0 |= ( zSig1 != 0 );
740
    /* Adjust the increment per rounding mode: the nearest modes keep the
     * half-unit increment, round-to-zero adds nothing, and the directed modes
     * add the whole mask only when the sign matches the direction. */
    switch (roundingMode) {
741
    case float_round_nearest_even:
742
    case float_round_ties_away:
743
        break;
744
    case float_round_to_zero:
745
        roundIncrement = 0;
746
        break;
747
    case float_round_up:
748
        roundIncrement = zSign ? 0 : roundMask;
749
        break;
750
    case float_round_down:
751
        roundIncrement = zSign ? roundMask : 0;
752
        break;
753
    default:
754
        abort();
755
    }
756
    roundBits = zSig0 & roundMask;
757
    /* The unsigned comparison is true when zExp >= 0x7FFE or zExp <= 0, i.e.
     * whenever zExp lies outside 1..0x7FFD. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
758
        /* Overflow: exponent too large, or exponent 0x7FFE with the rounding
         * addition wrapping zSig0 around. */
        if (    ( 0x7FFE < zExp )
759
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
760
           ) {
761
            /* Jumps into the overflow handling of the 80-bit path; roundMask
             * keeps its reduced-precision value there. */
            goto overflow;
762
        }
763
        if ( zExp <= 0 ) {
764
            /* With flush_to_zero set, raise output_denormal and return a
             * zero carrying the sign. */
            if (STATUS(flush_to_zero)) {
765
                float_raise(float_flag_output_denormal STATUS_VAR);
766
                return packFloatx80(zSign, 0, 0);
767
            }
768
            isTiny =
769
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
770
                || ( zExp < 0 )
771
                || ( zSig0 <= zSig0 + roundIncrement );
772
            /* Shift right by 1 - zExp; presumably the helper keeps lost bits
             * as a sticky bit -- see shift64RightJamming. */
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
773
            zExp = 0;
774
            roundBits = zSig0 & roundMask;
775
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
776
            if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
777
            zSig0 += roundIncrement;
778
            /* Rounding carried into bit 63: the exponent field becomes 1. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
779
            roundIncrement = roundMask + 1;
780
            /* Exact tie under round-to-nearest-even: widen the mask so the
             * lowest retained bit is cleared as well. */
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
781
                roundMask |= roundIncrement;
782
            }
783
            zSig0 &= ~ roundMask;
784
            return packFloatx80( zSign, zExp, zSig0 );
785
        }
786
    }
787
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
788
    zSig0 += roundIncrement;
789
    /* The addition wrapped around: bump the exponent and restart the
     * significand at 0x8000000000000000. */
    if ( zSig0 < roundIncrement ) {
790
        ++zExp;
791
        zSig0 = LIT64( 0x8000000000000000 );
792
    }
793
    roundIncrement = roundMask + 1;
794
    /* Same tie-to-even handling as in the subnormal branch above. */
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
795
        roundMask |= roundIncrement;
796
    }
797
    zSig0 &= ~ roundMask;
798
    /* A zero significand is packed with a zero exponent. */
    if ( zSig0 == 0 ) zExp = 0;
799
    return packFloatx80( zSign, zExp, zSig0 );
800
 precision80:
801
    /* Full precision: zSig1 holds the bits below the result.  The nearest
     * modes increment when the top bit of zSig1 is set; the directed modes
     * increment on any nonzero zSig1 when the sign matches the direction. */
    switch (roundingMode) {
802
    case float_round_nearest_even:
803
    case float_round_ties_away:
804
        increment = ((int64_t)zSig1 < 0);
805
        break;
806
    case float_round_to_zero:
807
        increment = 0;
808
        break;
809
    case float_round_up:
810
        increment = !zSign && zSig1;
811
        break;
812
    case float_round_down:
813
        increment = zSign && zSig1;
814
        break;
815
    default:
816
        abort();
817
    }
818
    /* Same out-of-range test as above: zExp >= 0x7FFE or zExp <= 0. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
819
        if (    ( 0x7FFE < zExp )
820
             || (    ( zExp == 0x7FFE )
821
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
822
                  && increment
823
                )
824
           ) {
825
            /* On this path the largest finite significand is all ones
             * (~0); the reduced-precision path enters at `overflow' below
             * and skips this assignment. */
            roundMask = 0;
826
 overflow:
827
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
828
            /* Modes that round toward zero for this sign return exponent
             * 0x7FFE with significand ~roundMask; all others return exponent
             * 0x7FFF with significand 0x8000000000000000. */
            if (    ( roundingMode == float_round_to_zero )
829
                 || ( zSign && ( roundingMode == float_round_up ) )
830
                 || ( ! zSign && ( roundingMode == float_round_down ) )
831
               ) {
832
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
833
            }
834
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
835
        }
836
        if ( zExp <= 0 ) {
837
            /* NOTE(review): unlike the reduced-precision path, this branch
             * does not consult STATUS(flush_to_zero) -- confirm intended. */
            isTiny =
838
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
839
                || ( zExp < 0 )
840
                || ! increment
841
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
842
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
843
            zExp = 0;
844
            if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
845
            if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
846
            /* Recompute the increment from the shifted zSig1. */
            switch (roundingMode) {
847
            case float_round_nearest_even:
848
            case float_round_ties_away:
849
                increment = ((int64_t)zSig1 < 0);
850
                break;
851
            case float_round_to_zero:
852
                increment = 0;
853
                break;
854
            case float_round_up:
855
                increment = !zSign && zSig1;
856
                break;
857
            case float_round_down:
858
                increment = zSign && zSig1;
859
                break;
860
            default:
861
                abort();
862
            }
863
            if ( increment ) {
864
                ++zSig0;
865
                /* Clear bit 0 when zSig1<<1 is zero (nothing below the top
                 * bit of zSig1) and the mode is round-to-nearest-even. */
                zSig0 &=
866
                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
867
                /* The increment carried into bit 63: exponent field is 1. */
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
868
            }
869
            return packFloatx80( zSign, zExp, zSig0 );
870
        }
871
    }
872
    if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
873
    if ( increment ) {
874
        ++zSig0;
875
        /* The increment wrapped the significand to zero: bump the exponent
         * and restart the significand at 0x8000000000000000. */
        if ( zSig0 == 0 ) {
876
            ++zExp;
877
            zSig0 = LIT64( 0x8000000000000000 );
878
        }
879
        else {
880
            /* Same tie-to-even bit-0 clearing as in the subnormal branch. */
            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
881
        }
882
    }
883
    else {
884
        /* A zero significand is packed with a zero exponent. */
        if ( zSig0 == 0 ) zExp = 0;
885
    }
886
    return packFloatx80( zSign, zExp, zSig0 );
887

    
888
}
889

    
890
/*----------------------------------------------------------------------------
891
| Takes an abstract floating-point value having sign `zSign', exponent
892
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
893
| and returns the proper extended double-precision floating-point value
894
| corresponding to the abstract input.  This routine is just like
895
| `roundAndPackFloatx80' except that the input significand does not have to be
896
| normalized.
897
*----------------------------------------------------------------------------*/
898

    
899
static floatx80
900
 normalizeRoundAndPackFloatx80(
901
     int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
902
 STATUS_PARAM)
903
{
904
    int8 shiftCount;
905

    
906
    if ( zSig0 == 0 ) {
907
        zSig0 = zSig1;
908
        zSig1 = 0;
909
        zExp -= 64;
910
    }
911
    shiftCount = countLeadingZeros64( zSig0 );
912
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
913
    zExp -= shiftCount;
914
    return
915
        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
916

    
917
}
918

    
919
/*----------------------------------------------------------------------------
920
| Returns the least-significant 64 fraction bits of the quadruple-precision
921
| floating-point value `a'.
922
*----------------------------------------------------------------------------*/
923

    
924
INLINE uint64_t extractFloat128Frac1( float128 a )
{
    /* The whole low word of the value is fraction. */
    uint64_t lowFraction = a.low;

    return lowFraction;
}
930

    
931
/*----------------------------------------------------------------------------
932
| Returns the most-significant 48 fraction bits of the quadruple-precision
933
| floating-point value `a'.
934
*----------------------------------------------------------------------------*/
935

    
936
INLINE uint64_t extractFloat128Frac0( float128 a )
{
    /* Keep the low 48 bits of the high word; the 16 bits above them are
     * masked off. */
    const uint64_t fractionMask = LIT64( 0x0000FFFFFFFFFFFF );

    return a.high & fractionMask;
}
942

    
943
/*----------------------------------------------------------------------------
944
| Returns the exponent bits of the quadruple-precision floating-point value
945
| `a'.
946
*----------------------------------------------------------------------------*/
947

    
948
INLINE int32 extractFloat128Exp( float128 a )
{
    /* Bits 48..62 of the high word: shift them down and mask to 15 bits. */
    int32 expField = ( a.high >> 48 ) & 0x7FFF;

    return expField;
}
954

    
955
/*----------------------------------------------------------------------------
956
| Returns the sign bit of the quadruple-precision floating-point value `a'.
957
*----------------------------------------------------------------------------*/
958

    
959
INLINE flag extractFloat128Sign( float128 a )
{
    /* The sign is the top bit (bit 63) of the high word. */
    flag signBit = a.high >> 63;

    return signBit;
}
965

    
966
/*----------------------------------------------------------------------------
967
| Normalizes the subnormal quadruple-precision floating-point value
968
| represented by the denormalized significand formed by the concatenation of
969
| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
970
| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
971
| significand are stored at the location pointed to by `zSig0Ptr', and the
972
| least significant 64 bits of the normalized significand are stored at the
973
| location pointed to by `zSig1Ptr'.
974
*----------------------------------------------------------------------------*/
975

    
976
static void
977
 normalizeFloat128Subnormal(
978
     uint64_t aSig0,
979
     uint64_t aSig1,
980
     int32 *zExpPtr,
981
     uint64_t *zSig0Ptr,
982
     uint64_t *zSig1Ptr
983
 )
984
{
985
    int8 shiftCount;
986

    
987
    if ( aSig0 == 0 ) {
988
        shiftCount = countLeadingZeros64( aSig1 ) - 15;
989
        if ( shiftCount < 0 ) {
990
            *zSig0Ptr = aSig1>>( - shiftCount );
991
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
992
        }
993
        else {
994
            *zSig0Ptr = aSig1<<shiftCount;
995
            *zSig1Ptr = 0;
996
        }
997
        *zExpPtr = - shiftCount - 63;
998
    }
999
    else {
1000
        shiftCount = countLeadingZeros64( aSig0 ) - 15;
1001
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
1002
        *zExpPtr = 1 - shiftCount;
1003
    }
1004

    
1005
}
1006

    
1007
/*----------------------------------------------------------------------------
1008
| Packs the sign `zSign', the exponent `zExp', and the significand formed
1009
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1010
| floating-point value, returning the result.  After being shifted into the
1011
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1012
| added together to form the most significant 64 bits of the result.  This
1013
| means that any integer portion of `zSig0' will be added into the exponent.
1014
| Since a properly normalized significand will have an integer portion equal
1015
| to 1, the `zExp' input should be 1 less than the desired result exponent
1016
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1017
| significand.
1018
*----------------------------------------------------------------------------*/
1019

    
1020
INLINE float128
1021
 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
1022
{
1023
    float128 z;
1024

    
1025
    z.low = zSig1;
1026
    z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1027
    return z;
1028

    
1029
}
1030

    
1031
/*----------------------------------------------------------------------------
1032
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1033
| and extended significand formed by the concatenation of `zSig0', `zSig1',
1034
| and `zSig2', and returns the proper quadruple-precision floating-point value
1035
| corresponding to the abstract input.  Ordinarily, the abstract value is
1036
| simply rounded and packed into the quadruple-precision format, with the
1037
| inexact exception raised if the abstract input cannot be represented
1038
| exactly.  However, if the abstract value is too large, the overflow and
1039
| inexact exceptions are raised and an infinity or maximal finite value is
1040
| returned.  If the abstract value is too small, the input value is rounded to
1041
| a subnormal number, and the underflow and inexact exceptions are raised if
1042
| the abstract input cannot be represented exactly as a subnormal quadruple-
1043
| precision floating-point number.
1044
|     The input significand must be normalized or smaller.  If the input
1045
| significand is not normalized, `zExp' must be 0; in that case, the result
1046
| returned is a subnormal number, and it must not require rounding.  In the
1047
| usual case that the input significand is normalized, `zExp' must be 1 less
1048
| than the ``true'' floating-point exponent.  The handling of underflow and
1049
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1050
*----------------------------------------------------------------------------*/
1051

    
1052
static float128
1053
 roundAndPackFloat128(
1054
     flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
1055
{
1056
    int8 roundingMode;
1057
    flag roundNearestEven, increment, isTiny;
1058

    
1059
    roundingMode = STATUS(float_rounding_mode);
1060
    roundNearestEven = ( roundingMode == float_round_nearest_even );
1061
    switch (roundingMode) {
1062
    case float_round_nearest_even:
1063
    case float_round_ties_away:
1064
        increment = ((int64_t)zSig2 < 0);
1065
        break;
1066
    case float_round_to_zero:
1067
        increment = 0;
1068
        break;
1069
    case float_round_up:
1070
        increment = !zSign && zSig2;
1071
        break;
1072
    case float_round_down:
1073
        increment = zSign && zSig2;
1074
        break;
1075
    default:
1076
        abort();
1077
    }
1078
    if ( 0x7FFD <= (uint32_t) zExp ) {
1079
        if (    ( 0x7FFD < zExp )
1080
             || (    ( zExp == 0x7FFD )
1081
                  && eq128(
1082
                         LIT64( 0x0001FFFFFFFFFFFF ),
1083
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
1084
                         zSig0,
1085
                         zSig1
1086
                     )
1087
                  && increment
1088
                )
1089
           ) {
1090
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1091
            if (    ( roundingMode == float_round_to_zero )
1092
                 || ( zSign && ( roundingMode == float_round_up ) )
1093
                 || ( ! zSign && ( roundingMode == float_round_down ) )
1094
               ) {
1095
                return
1096
                    packFloat128(
1097
                        zSign,
1098
                        0x7FFE,
1099
                        LIT64( 0x0000FFFFFFFFFFFF ),
1100
                        LIT64( 0xFFFFFFFFFFFFFFFF )
1101
                    );
1102
            }
1103
            return packFloat128( zSign, 0x7FFF, 0, 0 );
1104
        }
1105
        if ( zExp < 0 ) {
1106
            if (STATUS(flush_to_zero)) {
1107
                float_raise(float_flag_output_denormal STATUS_VAR);
1108
                return packFloat128(zSign, 0, 0, 0);
1109
            }
1110
            isTiny =
1111
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1112
                || ( zExp < -1 )
1113
                || ! increment
1114
                || lt128(
1115
                       zSig0,
1116
                       zSig1,
1117
                       LIT64( 0x0001FFFFFFFFFFFF ),
1118
                       LIT64( 0xFFFFFFFFFFFFFFFF )
1119
                   );
1120
            shift128ExtraRightJamming(
1121
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1122
            zExp = 0;
1123
            if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1124
            switch (roundingMode) {
1125
            case float_round_nearest_even:
1126
            case float_round_ties_away:
1127
                increment = ((int64_t)zSig2 < 0);
1128
                break;
1129
            case float_round_to_zero:
1130
                increment = 0;
1131
                break;
1132
            case float_round_up:
1133
                increment = !zSign && zSig2;
1134
                break;
1135
            case float_round_down:
1136
                increment = zSign && zSig2;
1137
                break;
1138
            default:
1139
                abort();
1140
            }
1141
        }
1142
    }
1143
    if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1144
    if ( increment ) {
1145
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1146
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1147
    }
1148
    else {
1149
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1150
    }
1151
    return packFloat128( zSign, zExp, zSig0, zSig1 );
1152

    
1153
}
1154

    
1155
/*----------------------------------------------------------------------------
1156
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1157
| and significand formed by the concatenation of `zSig0' and `zSig1', and
1158
| returns the proper quadruple-precision floating-point value corresponding
1159
| to the abstract input.  This routine is just like `roundAndPackFloat128'
1160
| except that the input significand has fewer bits and does not have to be
1161
| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1162
| point exponent.
1163
*----------------------------------------------------------------------------*/
1164

    
1165
static float128
1166
 normalizeRoundAndPackFloat128(
1167
     flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1168
{
1169
    int8 shiftCount;
1170
    uint64_t zSig2;
1171

    
1172
    if ( zSig0 == 0 ) {
1173
        zSig0 = zSig1;
1174
        zSig1 = 0;
1175
        zExp -= 64;
1176
    }
1177
    shiftCount = countLeadingZeros64( zSig0 ) - 15;
1178
    if ( 0 <= shiftCount ) {
1179
        zSig2 = 0;
1180
        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1181
    }
1182
    else {
1183
        shift128ExtraRightJamming(
1184
            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1185
    }
1186
    zExp -= shiftCount;
1187
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1188

    
1189
}
1190

    
1191
/*----------------------------------------------------------------------------
1192
| Returns the result of converting the 32-bit two's complement integer `a'
1193
| to the single-precision floating-point format.  The conversion is performed
1194
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1195
*----------------------------------------------------------------------------*/
1196

    
1197
float32 int32_to_float32(int32_t a STATUS_PARAM)
{
    flag negative;
    int32_t magnitude;

    if (a == 0) {
        return float32_zero;
    }
    /* The most negative value cannot be negated in int32_t, so it is packed
     * directly (sign 1, exponent field 0x9E, zero fraction). */
    if (a == (int32_t) 0x80000000) {
        return packFloat32(1, 0x9E, 0);
    }

    negative = (a < 0);
    magnitude = negative ? -a : a;
    return normalizeRoundAndPackFloat32(negative, 0x9C, magnitude STATUS_VAR);
}
1207

    
1208
/*----------------------------------------------------------------------------
1209
| Returns the result of converting the 32-bit two's complement integer `a'
1210
| to the double-precision floating-point format.  The conversion is performed
1211
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1212
*----------------------------------------------------------------------------*/
1213

    
1214
float64 int32_to_float64(int32_t a STATUS_PARAM)
{
    flag zSign;
    uint32 absA;
    int8 shiftCount;
    uint64_t zSig;

    if ( a == 0 ) return float64_zero;
    zSign = ( a < 0 );
    /* Negate in unsigned arithmetic.  `- a' on the signed operand overflows
     * (undefined behaviour) when `a' is INT32_MIN; the unsigned subtraction
     * wraps to the correct magnitude 0x80000000. */
    absA = zSign ? (uint32_t) 0 - (uint32_t) a : (uint32_t) a;
    /* Assuming countLeadingZeros32 returns the number of leading zero bits
     * of a 32-bit value, the shift moves the leading 1 of the magnitude to
     * bit 52 of zSig.  No rounding routine is called. */
    shiftCount = countLeadingZeros32( absA ) + 21;
    zSig = absA;
    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
}
1229

    
1230
/*----------------------------------------------------------------------------
1231
| Returns the result of converting the 32-bit two's complement integer `a'
1232
| to the extended double-precision floating-point format.  The conversion
1233
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1234
| Arithmetic.
1235
*----------------------------------------------------------------------------*/
1236

    
1237
floatx80 int32_to_floatx80(int32_t a STATUS_PARAM)
{
    flag zSign;
    uint32 absA;
    int8 shiftCount;
    uint64_t zSig;

    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
    zSign = ( a < 0 );
    /* Negate in unsigned arithmetic.  `- a' on the signed operand overflows
     * (undefined behaviour) when `a' is INT32_MIN; the unsigned subtraction
     * wraps to the correct magnitude 0x80000000. */
    absA = zSign ? (uint32_t) 0 - (uint32_t) a : (uint32_t) a;
    /* Assuming countLeadingZeros32 returns the number of leading zero bits
     * of a 32-bit value, the shift moves the leading 1 of the magnitude to
     * bit 63 of the significand.  No rounding routine is called. */
    shiftCount = countLeadingZeros32( absA ) + 32;
    zSig = absA;
    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
}
1252

    
1253
/*----------------------------------------------------------------------------
1254
| Returns the result of converting the 32-bit two's complement integer `a' to
1255
| the quadruple-precision floating-point format.  The conversion is performed
1256
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1257
*----------------------------------------------------------------------------*/
1258

    
1259
float128 int32_to_float128(int32_t a STATUS_PARAM)
{
    flag zSign;
    uint32 absA;
    int8 shiftCount;
    uint64_t zSig0;

    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
    zSign = ( a < 0 );
    /* Negate in unsigned arithmetic.  `- a' on the signed operand overflows
     * (undefined behaviour) when `a' is INT32_MIN; the unsigned subtraction
     * wraps to the correct magnitude 0x80000000. */
    absA = zSign ? (uint32_t) 0 - (uint32_t) a : (uint32_t) a;
    /* Assuming countLeadingZeros32 returns the number of leading zero bits
     * of a 32-bit value, the shift moves the leading 1 of the magnitude to
     * bit 48 of zSig0, from where packFloat128's addition carries it into
     * the exponent field.  The low significand word is zero. */
    shiftCount = countLeadingZeros32( absA ) + 17;
    zSig0 = absA;
    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
}
1274

    
1275
/*----------------------------------------------------------------------------
1276
| Returns the result of converting the 64-bit two's complement integer `a'
1277
| to the single-precision floating-point format.  The conversion is performed
1278
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1279
*----------------------------------------------------------------------------*/
1280

    
1281
float32 int64_to_float32(int64_t a STATUS_PARAM)
{
    flag zSign;
    uint64 absA;
    int8 shiftCount;

    if ( a == 0 ) return float32_zero;
    zSign = ( a < 0 );
    /* Negate in unsigned arithmetic.  `- a' on the signed operand overflows
     * (undefined behaviour) when `a' is INT64_MIN; the unsigned subtraction
     * wraps to the correct magnitude 0x8000000000000000. */
    absA = zSign ? (uint64_t) 0 - (uint64_t) a : (uint64_t) a;
    shiftCount = countLeadingZeros64( absA ) - 40;
    if ( 0 <= shiftCount ) {
        /* At least 40 leading zeros: the magnitude is packed directly,
         * without going through a rounding routine. */
        return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
    }
    else {
        /* Otherwise align the magnitude for roundAndPackFloat32, shifting
         * right through shift64RightJamming when it is too wide. */
        shiftCount += 7;
        if ( shiftCount < 0 ) {
            shift64RightJamming( absA, - shiftCount, &absA );
        }
        else {
            absA <<= shiftCount;
        }
        return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
    }
}
1306

    
1307
/*----------------------------------------------------------------------------
| Returns the result of converting the 64-bit unsigned integer `a' to the
| single-precision floating-point format.  Values with at least 40 leading
| zero bits are packed directly; larger values go through
| `roundAndPackFloat32'.
*----------------------------------------------------------------------------*/
float32 uint64_to_float32(uint64_t a STATUS_PARAM)
{
    int8 shift;

    if (!a) {
        return float32_zero;
    }

    shift = countLeadingZeros64(a) - 40;
    if (shift >= 0) {
        return packFloat32(0, 0x95 - shift, a << shift);
    }

    /* Align the value for the rounding routine; when it is too wide it is
     * shifted right through shift64RightJamming. */
    shift += 7;
    if (shift >= 0) {
        a <<= shift;
    } else {
        shift64RightJamming(a, -shift, &a);
    }
    return roundAndPackFloat32(0, 0x9C - shift, a STATUS_VAR);
}
1327

    
1328
/*----------------------------------------------------------------------------
1329
| Returns the result of converting the 64-bit two's complement integer `a'
1330
| to the double-precision floating-point format.  The conversion is performed
1331
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1332
*----------------------------------------------------------------------------*/
1333

    
1334
float64 int64_to_float64(int64_t a STATUS_PARAM)
{
    flag negative;
    int64_t magnitude;

    if (a == 0) {
        return float64_zero;
    }
    /* The most negative value cannot be negated in int64_t, so it is packed
     * directly (sign 1, exponent field 0x43E, zero fraction). */
    if (a == (int64_t) LIT64( 0x8000000000000000 )) {
        return packFloat64(1, 0x43E, 0);
    }

    negative = (a < 0);
    magnitude = negative ? -a : a;
    return normalizeRoundAndPackFloat64(negative, 0x43C, magnitude STATUS_VAR);
}
1346

    
1347
/*----------------------------------------------------------------------------
| Returns the result of converting the 64-bit unsigned integer `a' to the
| double-precision floating-point format, via `normalizeRoundAndPackFloat64'.
*----------------------------------------------------------------------------*/
float64 uint64_to_float64(uint64_t a STATUS_PARAM)
{
    int zExp;

    if (!a) {
        return float64_zero;
    }

    if (a & LIT64( 0x8000000000000000 )) {
        /* Top bit set: shift right by one through shift64RightJamming and
         * raise the exponent by one to compensate. */
        shift64RightJamming(a, 1, &a);
        zExp = 0x43D;
    } else {
        zExp = 0x43C;
    }
    return normalizeRoundAndPackFloat64(0, zExp, a STATUS_VAR);
}
1360

    
1361
/*----------------------------------------------------------------------------
1362
| Returns the result of converting the 64-bit two's complement integer `a'
1363
| to the extended double-precision floating-point format.  The conversion
1364
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1365
| Arithmetic.
1366
*----------------------------------------------------------------------------*/
1367

    
1368
floatx80 int64_to_floatx80(int64_t a STATUS_PARAM)
{
    flag zSign;
    uint64 absA;
    int8 shiftCount;

    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
    zSign = ( a < 0 );
    /* Negate in unsigned arithmetic.  `- a' on the signed operand overflows
     * (undefined behaviour) when `a' is INT64_MIN; the unsigned subtraction
     * wraps to the correct magnitude 0x8000000000000000. */
    absA = zSign ? (uint64_t) 0 - (uint64_t) a : (uint64_t) a;
    /* Shift the magnitude left until its leading 1 is at bit 63.  No
     * rounding routine is called. */
    shiftCount = countLeadingZeros64( absA );
    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
}
1381

    
1382
/*----------------------------------------------------------------------------
1383
| Returns the result of converting the 64-bit two's complement integer `a' to
1384
| the quadruple-precision floating-point format.  The conversion is performed
1385
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1386
*----------------------------------------------------------------------------*/
1387

    
1388
float128 int64_to_float128(int64_t a STATUS_PARAM)
{
    flag zSign;
    uint64 absA;
    int8 shiftCount;
    int32 zExp;
    uint64_t zSig0, zSig1;

    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
    zSign = ( a < 0 );
    /* Negate in unsigned arithmetic.  `- a' on the signed operand overflows
     * (undefined behaviour) when `a' is INT64_MIN; the unsigned subtraction
     * wraps to the correct magnitude 0x8000000000000000. */
    absA = zSign ? (uint64_t) 0 - (uint64_t) a : (uint64_t) a;
    /* Total left shift that moves the leading 1 to bit 48 of the high word
     * when the magnitude starts in the low word of a 128-bit pair. */
    shiftCount = countLeadingZeros64( absA ) + 49;
    zExp = 0x406E - shiftCount;
    if ( 64 <= shiftCount ) {
        /* A shift of 64 or more: start with the magnitude in the high word
         * and shift only by the remainder. */
        zSig1 = 0;
        zSig0 = absA;
        shiftCount -= 64;
    }
    else {
        zSig1 = absA;
        zSig0 = 0;
    }
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    return packFloat128( zSign, zExp, zSig0, zSig1 );
}
1414

    
1415
/*----------------------------------------------------------------------------
| Returns the result of converting the 64-bit unsigned integer `a' to the
| quadruple-precision floating-point format.
*----------------------------------------------------------------------------*/
float128 uint64_to_float128(uint64_t a STATUS_PARAM)
{
    if (a == 0) {
        return float128_zero;
    }
    /* The integer is the LOW word of the 128-bit significand, matching
     * int64_to_float128, which uses the same 0x406E exponent base with the
     * magnitude in the low word.  Passing `a' as the high word made the
     * result 2^64 times too large, because normalizeRoundAndPackFloat128
     * only subtracts 64 from the exponent when the high word is zero. */
    return normalizeRoundAndPackFloat128(0, 0x406E, 0, a STATUS_VAR);
}
1422

    
1423
/*----------------------------------------------------------------------------
1424
| Returns the result of converting the single-precision floating-point value
1425
| `a' to the 32-bit two's complement integer format.  The conversion is
1426
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1427
| Arithmetic---which means in particular that the conversion is rounded
1428
| according to the current rounding mode.  If `a' is a NaN, the largest
1429
| positive integer is returned.  Otherwise, if the conversion overflows, the
1430
| largest integer with the same sign as `a' is returned.
1431
*----------------------------------------------------------------------------*/
1432

    
1433
int32 float32_to_int32( float32 a STATUS_PARAM )
{
    flag negative;
    int_fast16_t exponent, shift;
    uint32_t fraction;
    uint64_t wide;

    a = float32_squash_input_denormal(a STATUS_VAR);
    fraction = extractFloat32Frac(a);
    exponent = extractFloat32Exp(a);
    negative = extractFloat32Sign(a);

    /* Exponent 0xFF with a nonzero fraction (NaN): the sign is dropped so
     * the value is handled as positive. */
    if ((exponent == 0xFF) && fraction) {
        negative = 0;
    }
    /* A nonzero exponent field gets the implicit leading 1 at bit 23. */
    if (exponent) {
        fraction |= 0x00800000;
    }

    /* Place the significand in the upper half of a 64-bit word, then shift
     * right (through shift64RightJamming) by the exponent-dependent amount
     * before handing it to roundAndPackInt32. */
    wide = (uint64_t) fraction << 32;
    shift = 0xAF - exponent;
    if (shift > 0) {
        shift64RightJamming(wide, shift, &wide);
    }
    return roundAndPackInt32(negative, wide STATUS_VAR);
}
1453

    
1454
/*----------------------------------------------------------------------------
1455
| Returns the result of converting the single-precision floating-point value
1456
| `a' to the 32-bit two's complement integer format.  The conversion is
1457
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1458
| Arithmetic, except that the conversion is always rounded toward zero.
1459
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1460
| the conversion overflows, the largest integer with the same sign as `a' is
1461
| returned.
1462
*----------------------------------------------------------------------------*/
1463

    
1464
int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
{
    flag negative;
    int_fast16_t exponent, shift;
    uint32_t fraction, aligned;
    int32_t magnitude;

    a = float32_squash_input_denormal(a STATUS_VAR);
    fraction = extractFloat32Frac(a);
    exponent = extractFloat32Exp(a);
    negative = extractFloat32Sign(a);
    shift = exponent - 0x9E;

    if (shift >= 0) {
        /* Exponent field 0x9E or above.  Only the bit pattern 0xCF000000
         * returns 0x80000000 without raising the invalid flag. */
        if (float32_val(a) == 0xCF000000) {
            return (int32_t) 0x80000000;
        }
        float_raise(float_flag_invalid STATUS_VAR);
        /* Positive inputs and NaNs (exponent 0xFF, nonzero fraction) give
         * the largest positive integer; the rest give the most negative. */
        if (!negative || ((exponent == 0xFF) && fraction)) {
            return 0x7FFFFFFF;
        }
        return (int32_t) 0x80000000;
    }

    if (exponent <= 0x7E) {
        /* The result truncates to 0; it is inexact unless the input itself
         * is zero. */
        if (exponent | fraction) {
            STATUS(float_exception_flags) |= float_flag_inexact;
        }
        return 0;
    }

    /* Left-align the significand (implicit 1 included) in 32 bits, then
     * shift the integer part down. */
    aligned = (fraction | 0x00800000) << 8;
    magnitude = aligned >> (-shift);
    /* Any bits that the right shift dropped make the result inexact. */
    if ((uint32_t) (aligned << (shift & 31))) {
        STATUS(float_exception_flags) |= float_flag_inexact;
    }
    return negative ? -magnitude : magnitude;
}
1496

    
1497
/*----------------------------------------------------------------------------
1498
| Returns the result of converting the single-precision floating-point value
1499
| `a' to the 16-bit two's complement integer format.  The conversion is
1500
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1501
| Arithmetic, except that the conversion is always rounded toward zero.
1502
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1503
| the conversion overflows, the largest integer with the same sign as `a' is
1504
| returned.
1505
*----------------------------------------------------------------------------*/
1506

    
1507
int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)
{
    flag aSign;
    int_fast16_t aExp, shiftCount;
    uint32_t aSig;
    int32 z;

    /* Treat the input like every other float32 conversion in this file. */
    a = float32_squash_input_denormal(a STATUS_VAR);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    /* 0x8E is the biased exponent of 2^15. */
    shiftCount = aExp - 0x8E;
    if ( 0 <= shiftCount ) {
        if (aSign && (shiftCount == 0) && (aSig < 0x100)) {
            /* -32769 < a <= -32768: with this exponent fraction bit 8 has
             * weight 1, so only the low 8 fraction bits are below the
             * integer position.  Truncation yields -32768, which is
             * representable; the conversion is merely inexact when any
             * of those low bits is set. */
            if (aSig) {
                STATUS(float_exception_flags) |= float_flag_inexact;
            }
            return (int32_t) 0xffff8000;
        }
        /* Genuine overflow, infinity or NaN. */
        float_raise( float_flag_invalid STATUS_VAR);
        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
            return 0x7FFF;
        }
        return (int32_t) 0xffff8000;
    }
    else if ( aExp <= 0x7E ) {
        /* Magnitude below 1: the result is 0, inexact unless `a' is zero. */
        if ( aExp | aSig ) {
            STATUS(float_exception_flags) |= float_flag_inexact;
        }
        return 0;
    }
    /* Reuse the 32-bit shifting scheme: bias the count by a further 16 so
     * the integer part ends up in the low 16 bits. */
    shiftCount -= 0x10;
    aSig = ( aSig | 0x00800000 )<<8;
    z = aSig>>( - shiftCount );
    if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
        STATUS(float_exception_flags) |= float_flag_inexact;
    }
    if ( aSign ) {
        z = - z;
    }
    return z;

}
1545

    
1546
/*----------------------------------------------------------------------------
1547
| Returns the result of converting the single-precision floating-point value
1548
| `a' to the 64-bit two's complement integer format.  The conversion is
1549
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1550
| Arithmetic---which means in particular that the conversion is rounded
1551
| according to the current rounding mode.  If `a' is a NaN, the largest
1552
| positive integer is returned.  Otherwise, if the conversion overflows, the
1553
| largest integer with the same sign as `a' is returned.
1554
*----------------------------------------------------------------------------*/
1555

    
1556
int64 float32_to_int64( float32 a STATUS_PARAM )
{
    flag aSign;
    int_fast16_t aExp, shiftCount;
    uint32_t aSig;
    uint64_t aSig64, aSigExtra;

    a = float32_squash_input_denormal(a STATUS_VAR);
    aSign = extractFloat32Sign(a);
    aExp = extractFloat32Exp(a);
    aSig = extractFloat32Frac(a);

    /* 0xBE is the biased exponent of 2^63. */
    shiftCount = 0xBE - aExp;
    if (shiftCount < 0) {
        /* Exponent above 2^63: out of range, infinity or NaN. */
        float_raise(float_flag_invalid STATUS_VAR);
        if (aSign && !((aExp == 0xFF) && aSig)) {
            /* Negative and not a NaN. */
            return (int64_t) LIT64(0x8000000000000000);
        }
        return LIT64(0x7FFFFFFFFFFFFFFF);
    }

    if (aExp != 0) {
        /* Normal number: restore the implicit leading one. */
        aSig |= 0x00800000;
    }
    /* Put the leading bit at bit 63, then shift into place keeping the
     * discarded bits in aSigExtra for the rounding step. */
    aSig64 = aSig;
    aSig64 <<= 40;
    shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
    return roundAndPackInt64(aSign, aSig64, aSigExtra STATUS_VAR);
}
1582

    
1583
/*----------------------------------------------------------------------------
1584
| Returns the result of converting the single-precision floating-point value
1585
| `a' to the 64-bit unsigned integer format.  The conversion is
1586
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1587
| Arithmetic---which means in particular that the conversion is rounded
1588
| according to the current rounding mode.  If `a' is a NaN, the largest
1589
| unsigned integer is returned.  Otherwise, if the conversion overflows, the
1590
| largest unsigned integer is returned.  If `a' is negative, the result
1591
| is rounded and zero is returned; values that do not round to zero will
1592
| raise the inexact exception flag.
1593
*----------------------------------------------------------------------------*/
1594

    
1595
uint64 float32_to_uint64(float32 a STATUS_PARAM)
{
    flag aSign;
    int_fast16_t aExp, shiftCount;
    uint32_t aSig;
    uint64_t aSig64, aSigExtra;

    a = float32_squash_input_denormal(a STATUS_VAR);
    aSign = extractFloat32Sign(a);
    aExp = extractFloat32Exp(a);
    aSig = extractFloat32Frac(a);

    if (aSign && (aExp > 126)) {
        /* Sign bit set with magnitude of at least 1 (or a NaN with the
         * sign bit set): invalid.  NaNs give all-ones, everything else 0. */
        float_raise(float_flag_invalid STATUS_VAR);
        if (float32_is_any_nan(a)) {
            return LIT64(0xFFFFFFFFFFFFFFFF);
        }
        return 0;
    }

    shiftCount = 0xBE - aExp;
    if (shiftCount < 0) {
        /* Exponent too large for the shift below (includes the remaining
         * infinities and NaNs). */
        float_raise(float_flag_invalid STATUS_VAR);
        return LIT64(0xFFFFFFFFFFFFFFFF);
    }

    if (aExp != 0) {
        /* Normal number: restore the implicit leading one. */
        aSig |= 0x00800000;
    }
    aSig64 = aSig;
    aSig64 <<= 40;
    shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
    return roundAndPackUint64(aSign, aSig64, aSigExtra STATUS_VAR);
}
1628

    
1629
/*----------------------------------------------------------------------------
1630
| Returns the result of converting the single-precision floating-point value
1631
| `a' to the 64-bit two's complement integer format.  The conversion is
1632
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1633
| Arithmetic, except that the conversion is always rounded toward zero.  If
1634
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1635
| conversion overflows, the largest integer with the same sign as `a' is
1636
| returned.
1637
*----------------------------------------------------------------------------*/
1638

    
1639
int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
{
    flag aSign;
    int_fast16_t aExp, shiftCount;
    uint32_t aSig;
    uint64_t aSig64;
    int64 z;

    a = float32_squash_input_denormal(a STATUS_VAR);
    aSign = extractFloat32Sign(a);
    aExp = extractFloat32Exp(a);
    aSig = extractFloat32Frac(a);

    /* 0xBE is the biased exponent of 2^63. */
    shiftCount = aExp - 0xBE;
    if (shiftCount >= 0) {
        /* Only the bit pattern 0xDF000000 (-2^63) converts without
         * raising the invalid flag. */
        if (float32_val(a) != 0xDF000000) {
            float_raise(float_flag_invalid STATUS_VAR);
            if (!aSign || ((aExp == 0xFF) && aSig)) {
                /* Positive overflow or NaN. */
                return LIT64(0x7FFFFFFFFFFFFFFF);
            }
        }
        return (int64_t) LIT64(0x8000000000000000);
    }
    if (aExp <= 0x7E) {
        /* Magnitude below 1: the result is 0, inexact unless `a' is zero. */
        if (aExp != 0 || aSig != 0) {
            STATUS(float_exception_flags) |= float_flag_inexact;
        }
        return 0;
    }

    /* Left-align the significand (implicit bit in bit 63), then shift the
     * integer part down; any bits shifted out make the result inexact. */
    aSig64 = aSig | 0x00800000;
    aSig64 <<= 40;
    z = aSig64 >> (-shiftCount);
    if ((uint64_t) (aSig64 << (shiftCount & 63))) {
        STATUS(float_exception_flags) |= float_flag_inexact;
    }
    if (aSign) {
        z = -z;
    }
    return z;
}
1675

    
1676
/*----------------------------------------------------------------------------
1677
| Returns the result of converting the single-precision floating-point value
1678
| `a' to the double-precision floating-point format.  The conversion is
1679
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1680
| Arithmetic.
1681
*----------------------------------------------------------------------------*/
1682

    
1683
float64 float32_to_float64( float32 a STATUS_PARAM )
{
    flag aSign;
    int_fast16_t aExp;
    uint32_t aSig;

    a = float32_squash_input_denormal(a STATUS_VAR);
    aSign = extractFloat32Sign(a);
    aExp = extractFloat32Exp(a);
    aSig = extractFloat32Frac(a);

    if (aExp == 0xFF) {
        /* NaN (non-zero fraction) or infinity. */
        if (aSig) {
            return commonNaNToFloat64(float32ToCommonNaN(a STATUS_VAR) STATUS_VAR);
        }
        return packFloat64(aSign, 0x7FF, 0);
    }
    if (aExp == 0) {
        if (aSig == 0) {
            /* Signed zero. */
            return packFloat64(aSign, 0, 0);
        }
        /* Subnormal input: normalize, then adjust the exponent by one
         * before rebiasing below. */
        normalizeFloat32Subnormal(aSig, &aExp, &aSig);
        aExp -= 1;
    }
    /* Rebias the exponent (0x380 = 0x3FF - 0x7F) and widen the fraction
     * from 23 to 52 bits. */
    return packFloat64(aSign, aExp + 0x380, ((uint64_t) aSig) << 29);
}
1705

    
1706
/*----------------------------------------------------------------------------
1707
| Returns the result of converting the single-precision floating-point value
1708
| `a' to the extended double-precision floating-point format.  The conversion
1709
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1710
| Arithmetic.
1711
*----------------------------------------------------------------------------*/
1712

    
1713
floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
{
    flag aSign;
    int_fast16_t aExp;
    uint32_t aSig;

    a = float32_squash_input_denormal(a STATUS_VAR);
    aSign = extractFloat32Sign(a);
    aExp = extractFloat32Exp(a);
    aSig = extractFloat32Frac(a);

    if (aExp == 0xFF) {
        /* NaN (non-zero fraction) or infinity. */
        if (aSig) {
            return commonNaNToFloatx80(float32ToCommonNaN(a STATUS_VAR) STATUS_VAR);
        }
        return packFloatx80(aSign, 0x7FFF, LIT64(0x8000000000000000));
    }
    if (aExp == 0) {
        if (aSig == 0) {
            /* Signed zero. */
            return packFloatx80(aSign, 0, 0);
        }
        normalizeFloat32Subnormal(aSig, &aExp, &aSig);
    }
    /* The extended format stores its leading bit explicitly: set it and
     * move the 24-bit significand to the top of the 64-bit field. */
    aSig |= 0x00800000;
    return packFloatx80(aSign, aExp + 0x3F80, ((uint64_t) aSig) << 40);
}
1735

    
1736
/*----------------------------------------------------------------------------
1737
| Returns the result of converting the single-precision floating-point value
1738
| `a' to the quadruple-precision floating-point format.  The conversion is
1739
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1740
| Arithmetic.
1741
*----------------------------------------------------------------------------*/
1742

    
1743
float128 float32_to_float128( float32 a STATUS_PARAM )
{
    flag aSign;
    int_fast16_t aExp;
    uint32_t aSig;

    a = float32_squash_input_denormal(a STATUS_VAR);
    aSign = extractFloat32Sign(a);
    aExp = extractFloat32Exp(a);
    aSig = extractFloat32Frac(a);

    if (aExp == 0xFF) {
        /* NaN (non-zero fraction) or infinity. */
        if (aSig) {
            return commonNaNToFloat128(float32ToCommonNaN(a STATUS_VAR) STATUS_VAR);
        }
        return packFloat128(aSign, 0x7FFF, 0, 0);
    }
    if (aExp == 0) {
        if (aSig == 0) {
            /* Signed zero. */
            return packFloat128(aSign, 0, 0, 0);
        }
        /* Subnormal input: normalize, then adjust the exponent by one
         * before rebiasing below. */
        normalizeFloat32Subnormal(aSig, &aExp, &aSig);
        aExp -= 1;
    }
    /* Rebias the exponent (0x3F80 = 0x3FFF - 0x7F); the fraction goes into
     * the upper 64-bit half, the lower half is zero. */
    return packFloat128(aSign, aExp + 0x3F80, ((uint64_t) aSig) << 25, 0);
}
1765

    
1766
/*----------------------------------------------------------------------------
1767
| Rounds the single-precision floating-point value `a' to an integer, and
1768
| returns the result as a single-precision floating-point value.  The
1769
| operation is performed according to the IEC/IEEE Standard for Binary
1770
| Floating-Point Arithmetic.
1771
*----------------------------------------------------------------------------*/
1772

    
1773
float32 float32_round_to_int( float32 a STATUS_PARAM)
{
    flag aSign;
    int_fast16_t aExp;
    uint32_t lastBitMask, roundBitsMask;
    uint32_t z;

    a = float32_squash_input_denormal(a STATUS_VAR);
    aExp = extractFloat32Exp(a);

    if (aExp >= 0x96) {
        /* Exponent of 2^23 or more: no fraction bits below the units
         * position, so the value is already integral (or is inf/NaN). */
        if ((aExp == 0xFF) && extractFloat32Frac(a)) {
            return propagateFloat32NaN(a, a STATUS_VAR);
        }
        return a;
    }

    if (aExp <= 0x7E) {
        /* Magnitude below 1. */
        if ((uint32_t) (float32_val(a) << 1) == 0) {
            /* +0 or -0 is returned unchanged and exact. */
            return a;
        }
        STATUS(float_exception_flags) |= float_flag_inexact;
        aSign = extractFloat32Sign(a);
        switch (STATUS(float_rounding_mode)) {
        case float_round_nearest_even:
            /* Strictly more than one half rounds to +/-1. */
            if ((aExp == 0x7E) && extractFloat32Frac(a)) {
                return packFloat32(aSign, 0x7F, 0);
            }
            break;
        case float_round_ties_away:
            /* One half or more rounds to +/-1. */
            if (aExp == 0x7E) {
                return packFloat32(aSign, 0x7F, 0);
            }
            break;
        case float_round_down:
            /* -1.0 for negative inputs, +0 otherwise. */
            if (aSign) {
                return make_float32(0xBF800000);
            }
            return make_float32(0);
        case float_round_up:
            /* -0 for negative inputs, +1.0 otherwise. */
            if (aSign) {
                return make_float32(0x80000000);
            }
            return make_float32(0x3F800000);
        }
        return packFloat32(aSign, 0, 0);
    }

    /* 1 <= |a| < 2^23: lastBitMask selects the units bit of the encoding,
     * roundBitsMask the fraction bits below it. */
    lastBitMask = (uint32_t) 1 << (0x96 - aExp);
    roundBitsMask = lastBitMask - 1;
    z = float32_val(a);
    switch (STATUS(float_rounding_mode)) {
    case float_round_nearest_even:
        z += lastBitMask >> 1;
        if ((z & roundBitsMask) == 0) {
            /* Exact tie: clear the units bit to land on the even value. */
            z &= ~lastBitMask;
        }
        break;
    case float_round_ties_away:
        z += lastBitMask >> 1;
        break;
    case float_round_to_zero:
        break;
    case float_round_up:
        /* Only positive values move away from zero. */
        if (!extractFloat32Sign(make_float32(z))) {
            z += roundBitsMask;
        }
        break;
    case float_round_down:
        /* Only negative values move away from zero. */
        if (extractFloat32Sign(make_float32(z))) {
            z += roundBitsMask;
        }
        break;
    default:
        abort();
    }
    z &= ~roundBitsMask;
    if (z != float32_val(a)) {
        STATUS(float_exception_flags) |= float_flag_inexact;
    }
    return make_float32(z);
}
1844

    
1845
/*----------------------------------------------------------------------------
1846
| Returns the result of adding the absolute values of the single-precision
1847
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1848
| before being returned.  `zSign' is ignored if the result is a NaN.
1849
| The addition is performed according to the IEC/IEEE Standard for Binary
1850
| Floating-Point Arithmetic.
1851
*----------------------------------------------------------------------------*/
1852

    
1853
static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
{
    int_fast16_t aExp, bExp, zExp;
    uint32_t aSig, bSig, zSig;
    int_fast16_t expDiff;

    aExp = extractFloat32Exp(a);
    bExp = extractFloat32Exp(b);
    /* Fractions are kept shifted left by 6; the implicit bit then sits at
     * 0x20000000. */
    aSig = extractFloat32Frac(a) << 6;
    bSig = extractFloat32Frac(b) << 6;
    expDiff = aExp - bExp;

    if (expDiff == 0) {
        /* Same exponent: no alignment shift is needed. */
        if (aExp == 0xFF) {
            if (aSig | bSig) {
                return propagateFloat32NaN(a, b STATUS_VAR);
            }
            return a;
        }
        if (aExp == 0) {
            /* Both operands are zero or subnormal. */
            if (STATUS(flush_to_zero)) {
                if (aSig | bSig) {
                    float_raise(float_flag_output_denormal STATUS_VAR);
                }
                return packFloat32(zSign, 0, 0);
            }
            return packFloat32(zSign, 0, (aSig + bSig) >> 6);
        }
        /* 0x40000000 is the sum of the two implicit bits. */
        zSig = 0x40000000 + aSig + bSig;
        return roundAndPackFloat32(zSign, aExp, zSig STATUS_VAR);
    }

    if (expDiff > 0) {
        /* `a' has the larger exponent: align `b' to it. */
        if (aExp == 0xFF) {
            if (aSig) {
                return propagateFloat32NaN(a, b STATUS_VAR);
            }
            return a;
        }
        if (bExp == 0) {
            --expDiff;
        } else {
            bSig |= 0x20000000;
        }
        shift32RightJamming(bSig, expDiff, &bSig);
        zExp = aExp;
    } else {
        /* `b' has the larger exponent: align `a' to it. */
        if (bExp == 0xFF) {
            if (bSig) {
                return propagateFloat32NaN(a, b STATUS_VAR);
            }
            return packFloat32(zSign, 0xFF, 0);
        }
        if (aExp == 0) {
            ++expDiff;
        } else {
            aSig |= 0x20000000;
        }
        shift32RightJamming(aSig, -expDiff, &aSig);
        zExp = bExp;
    }

    aSig |= 0x20000000;
    /* Try the sum shifted up by one with the exponent one lower; if that
     * sets the top bit, use the unshifted sum and the original exponent. */
    zSig = (aSig + bSig) << 1;
    --zExp;
    if ((int32_t) zSig < 0) {
        zSig = aSig + bSig;
        ++zExp;
    }
    return roundAndPackFloat32(zSign, zExp, zSig STATUS_VAR);
}
1923

    
1924
/*----------------------------------------------------------------------------
1925
| Returns the result of subtracting the absolute values of the single-
1926
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
1927
| difference is negated before being returned.  `zSign' is ignored if the
1928
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
1929
| Standard for Binary Floating-Point Arithmetic.
1930
*----------------------------------------------------------------------------*/
1931

    
1932
static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
{
    int_fast16_t aExp, bExp, zExp;
    uint32_t aSig, bSig, zSig;
    int_fast16_t expDiff;
    flag bIsLarger;

    aExp = extractFloat32Exp(a);
    bExp = extractFloat32Exp(b);
    /* Fractions are kept shifted left by 7; the implicit bit then sits at
     * 0x40000000. */
    aSig = extractFloat32Frac(a) << 7;
    bSig = extractFloat32Frac(b) << 7;
    expDiff = aExp - bExp;

    if (expDiff > 0) {
        /* `a' has the larger exponent: align `b' to it. */
        if (aExp == 0xFF) {
            if (aSig) {
                return propagateFloat32NaN(a, b STATUS_VAR);
            }
            return a;
        }
        if (bExp == 0) {
            --expDiff;
        } else {
            bSig |= 0x40000000;
        }
        shift32RightJamming(bSig, expDiff, &bSig);
        aSig |= 0x40000000;
        bIsLarger = 0;
    } else if (expDiff < 0) {
        /* `b' has the larger exponent: align `a' to it. */
        if (bExp == 0xFF) {
            if (bSig) {
                return propagateFloat32NaN(a, b STATUS_VAR);
            }
            /* finite - infinity: infinity with the opposite sign. */
            return packFloat32(zSign ^ 1, 0xFF, 0);
        }
        if (aExp == 0) {
            ++expDiff;
        } else {
            aSig |= 0x40000000;
        }
        shift32RightJamming(aSig, -expDiff, &aSig);
        bSig |= 0x40000000;
        bIsLarger = 1;
    } else {
        /* Equal exponents. */
        if (aExp == 0xFF) {
            if (aSig | bSig) {
                return propagateFloat32NaN(a, b STATUS_VAR);
            }
            /* infinity - infinity */
            float_raise(float_flag_invalid STATUS_VAR);
            return float32_default_nan;
        }
        if (aExp == 0) {
            aExp = 1;
            bExp = 1;
        }
        if (aSig == bSig) {
            /* Exact cancellation: the zero is negative only when rounding
             * down. */
            return packFloat32(STATUS(float_rounding_mode) == float_round_down,
                               0, 0);
        }
        bIsLarger = (aSig < bSig);
    }

    if (bIsLarger) {
        /* The difference takes the sign opposite to zSign. */
        zSig = bSig - aSig;
        zExp = bExp;
        zSign ^= 1;
    } else {
        zSig = aSig - bSig;
        zExp = aExp;
    }
    --zExp;
    return normalizeRoundAndPackFloat32(zSign, zExp, zSig STATUS_VAR);
}
1998

    
1999
/*----------------------------------------------------------------------------
2000
| Returns the result of adding the single-precision floating-point values `a'
2001
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
2002
| Binary Floating-Point Arithmetic.
2003
*----------------------------------------------------------------------------*/
2004

    
2005
float32 float32_add( float32 a, float32 b STATUS_PARAM )
{
    flag aSign, bSign;

    a = float32_squash_input_denormal(a STATUS_VAR);
    b = float32_squash_input_denormal(b STATUS_VAR);
    aSign = extractFloat32Sign(a);
    bSign = extractFloat32Sign(b);

    /* Opposite signs: the sum is a difference of magnitudes. */
    if (aSign != bSign) {
        return subFloat32Sigs(a, b, aSign STATUS_VAR);
    }
    return addFloat32Sigs(a, b, aSign STATUS_VAR);
}
2021

    
2022
/*----------------------------------------------------------------------------
2023
| Returns the result of subtracting the single-precision floating-point values
2024
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2025
| for Binary Floating-Point Arithmetic.
2026
*----------------------------------------------------------------------------*/
2027

    
2028
float32 float32_sub( float32 a, float32 b STATUS_PARAM )
{
    flag aSign, bSign;

    a = float32_squash_input_denormal(a STATUS_VAR);
    b = float32_squash_input_denormal(b STATUS_VAR);
    aSign = extractFloat32Sign(a);
    bSign = extractFloat32Sign(b);

    /* Opposite signs: the difference is a sum of magnitudes. */
    if (aSign != bSign) {
        return addFloat32Sigs(a, b, aSign STATUS_VAR);
    }
    return subFloat32Sigs(a, b, aSign STATUS_VAR);
}
2044

    
2045
/*----------------------------------------------------------------------------
2046
| Returns the result of multiplying the single-precision floating-point values
2047
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2048
| for Binary Floating-Point Arithmetic.
2049
*----------------------------------------------------------------------------*/
2050

    
2051
float32 float32_mul( float32 a, float32 b STATUS_PARAM )
{
    flag aSign, bSign, zSign;
    int_fast16_t aExp, bExp, zExp;
    uint32_t aSig, bSig;
    uint64_t zSig64;
    uint32_t zSig;

    a = float32_squash_input_denormal(a STATUS_VAR);
    b = float32_squash_input_denormal(b STATUS_VAR);

    aSign = extractFloat32Sign(a);
    aExp = extractFloat32Exp(a);
    aSig = extractFloat32Frac(a);
    bSign = extractFloat32Sign(b);
    bExp = extractFloat32Exp(b);
    bSig = extractFloat32Frac(b);
    zSign = aSign ^ bSign;

    if (aExp == 0xFF) {
        /* `a' is infinity or NaN. */
        if (aSig || ((bExp == 0xFF) && bSig)) {
            return propagateFloat32NaN(a, b STATUS_VAR);
        }
        if ((bExp | bSig) == 0) {
            /* infinity * 0 */
            float_raise(float_flag_invalid STATUS_VAR);
            return float32_default_nan;
        }
        return packFloat32(zSign, 0xFF, 0);
    }
    if (bExp == 0xFF) {
        /* `b' is infinity or NaN, `a' is finite. */
        if (bSig) {
            return propagateFloat32NaN(a, b STATUS_VAR);
        }
        if ((aExp | aSig) == 0) {
            /* 0 * infinity */
            float_raise(float_flag_invalid STATUS_VAR);
            return float32_default_nan;
        }
        return packFloat32(zSign, 0xFF, 0);
    }
    if (aExp == 0) {
        if (aSig == 0) {
            return packFloat32(zSign, 0, 0);
        }
        normalizeFloat32Subnormal(aSig, &aExp, &aSig);
    }
    if (bExp == 0) {
        if (bSig == 0) {
            return packFloat32(zSign, 0, 0);
        }
        normalizeFloat32Subnormal(bSig, &bExp, &bSig);
    }

    zExp = aExp + bExp - 0x7F;
    /* Add the implicit bits and pre-shift the significands before taking
     * the 64-bit product; the result is then reduced to 32 bits. */
    aSig = (aSig | 0x00800000) << 7;
    bSig = (bSig | 0x00800000) << 8;
    shift64RightJamming(((uint64_t) aSig) * bSig, 32, &zSig64);
    zSig = zSig64;
    if (0 <= (int32_t) (zSig << 1)) {
        /* Bit 30 is clear: shift left one place and lower the exponent. */
        zSig <<= 1;
        --zExp;
    }
    return roundAndPackFloat32(zSign, zExp, zSig STATUS_VAR);
}
2107

    
2108
/*----------------------------------------------------------------------------
2109
| Returns the result of dividing the single-precision floating-point value `a'
2110
| by the corresponding value `b'.  The operation is performed according to the
2111
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2112
*----------------------------------------------------------------------------*/
2113

    
2114
float32 float32_div( float32 a, float32 b STATUS_PARAM )
{
    flag aSign, bSign, zSign;
    int_fast16_t aExp, bExp, zExp;
    uint32_t aSig, bSig, zSig;
    uint64_t dividend;

    a = float32_squash_input_denormal(a STATUS_VAR);
    b = float32_squash_input_denormal(b STATUS_VAR);

    aSign = extractFloat32Sign(a);
    aExp = extractFloat32Exp(a);
    aSig = extractFloat32Frac(a);
    bSign = extractFloat32Sign(b);
    bExp = extractFloat32Exp(b);
    bSig = extractFloat32Frac(b);
    zSign = aSign ^ bSign;

    if (aExp == 0xFF) {
        /* `a' is infinity or NaN. */
        if (aSig) {
            return propagateFloat32NaN(a, b STATUS_VAR);
        }
        if (bExp == 0xFF) {
            if (bSig) {
                return propagateFloat32NaN(a, b STATUS_VAR);
            }
            /* infinity / infinity */
            float_raise(float_flag_invalid STATUS_VAR);
            return float32_default_nan;
        }
        return packFloat32(zSign, 0xFF, 0);
    }
    if (bExp == 0xFF) {
        /* `b' is infinity or NaN, `a' is finite. */
        if (bSig) {
            return propagateFloat32NaN(a, b STATUS_VAR);
        }
        return packFloat32(zSign, 0, 0);
    }
    if (bExp == 0) {
        if (bSig == 0) {
            if ((aExp | aSig) == 0) {
                /* 0 / 0 */
                float_raise(float_flag_invalid STATUS_VAR);
                return float32_default_nan;
            }
            /* non-zero finite / 0 */
            float_raise(float_flag_divbyzero STATUS_VAR);
            return packFloat32(zSign, 0xFF, 0);
        }
        normalizeFloat32Subnormal(bSig, &bExp, &bSig);
    }
    if (aExp == 0) {
        if (aSig == 0) {
            return packFloat32(zSign, 0, 0);
        }
        normalizeFloat32Subnormal(aSig, &aExp, &aSig);
    }

    zExp = aExp - bExp + 0x7D;
    aSig = (aSig | 0x00800000) << 7;
    bSig = (bSig | 0x00800000) << 8;
    if (bSig <= (aSig + aSig)) {
        /* Halve the dividend and raise the exponent so the 64/32-bit
         * quotient below fits in 32 bits. */
        aSig >>= 1;
        ++zExp;
    }
    dividend = ((uint64_t) aSig) << 32;
    zSig = dividend / bSig;
    if ((zSig & 0x3F) == 0) {
        /* Low quotient bits are all zero: record a non-zero remainder as
         * a sticky bit. */
        zSig |= ((uint64_t) bSig * zSig != dividend);
    }
    return roundAndPackFloat32(zSign, zExp, zSig STATUS_VAR);
}
2171

    
2172
/*----------------------------------------------------------------------------
2173
| Returns the remainder of the single-precision floating-point value `a'
2174
| with respect to the corresponding value `b'.  The operation is performed
2175
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2176
*----------------------------------------------------------------------------*/
2177

    
2178
float32 float32_rem( float32 a, float32 b STATUS_PARAM )
{
    flag aSign, zSign;
    int_fast16_t aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;

    a = float32_squash_input_denormal(a STATUS_VAR);
    b = float32_squash_input_denormal(b STATUS_VAR);

    aSign = extractFloat32Sign(a);
    aExp = extractFloat32Exp(a);
    aSig = extractFloat32Frac(a);
    bExp = extractFloat32Exp(b);
    bSig = extractFloat32Frac(b);

    if (aExp == 0xFF) {
        /* `a' is infinity or NaN. */
        if (aSig || ((bExp == 0xFF) && bSig)) {
            return propagateFloat32NaN(a, b STATUS_VAR);
        }
        /* Remainder of an infinity is invalid. */
        float_raise(float_flag_invalid STATUS_VAR);
        return float32_default_nan;
    }
    if (bExp == 0xFF) {
        if (bSig) {
            return propagateFloat32NaN(a, b STATUS_VAR);
        }
        /* finite rem infinity: `a' itself. */
        return a;
    }
    if (bExp == 0) {
        if (bSig == 0) {
            /* Remainder by zero is invalid. */
            float_raise(float_flag_invalid STATUS_VAR);
            return float32_default_nan;
        }
        normalizeFloat32Subnormal(bSig, &bExp, &bSig);
    }
    if (aExp == 0) {
        if (aSig == 0) {
            return a;
        }
        normalizeFloat32Subnormal(aSig, &aExp, &aSig);
    }

    expDiff = aExp - bExp;
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if (expDiff < 32) {
        /* Exponents close enough for a single 64/32-bit division. */
        aSig <<= 8;
        bSig <<= 8;
        if (expDiff < 0) {
            if (expDiff < -1) {
                /* |a| is below |b|/2: `a' is already the remainder. */
                return a;
            }
            aSig >>= 1;
        }
        q = (bSig <= aSig);
        if (q) {
            aSig -= bSig;
        }
        if (0 < expDiff) {
            q = (((uint64_t) aSig) << 32) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ((aSig >> 1) << (expDiff - 1)) - bSig * q;
        } else {
            aSig >>= 2;
            bSig >>= 2;
        }
    } else {
        /* Large exponent gap: reduce in steps of 62 bits, each using a
         * quotient estimate lowered by 2 (clamped at 0). */
        if (bSig <= aSig) {
            aSig -= bSig;
        }
        aSig64 = ((uint64_t) aSig) << 40;
        bSig64 = ((uint64_t) bSig) << 40;
        expDiff -= 64;
        while (0 < expDiff) {
            q64 = estimateDiv128To64(aSig64, 0, bSig64);
            q64 = (2 < q64) ? q64 - 2 : 0;
            aSig64 = -((bSig * q64) << 38);
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64(aSig64, 0, bSig64);
        q64 = (2 < q64) ? q64 - 2 : 0;
        q = q64 >> (64 - expDiff);
        bSig <<= 6;
        aSig = ((aSig64 >> 33) << (expDiff - 1)) - bSig * q;
    }

    /* Subtract bSig until the partial remainder turns negative, keeping
     * the last non-negative value in alternateASig. */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while (0 <= (int32_t) aSig);

    /* Choose whichever of the two candidates is nearer to zero; on a tie
     * keep the one belonging to an even quotient. */
    sigMean = aSig + alternateASig;
    if ((sigMean < 0) || ((sigMean == 0) && (q & 1))) {
        aSig = alternateASig;
    }
    zSign = ((int32_t) aSig < 0);
    if (zSign) {
        aSig = -aSig;
    }
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig STATUS_VAR);
}
2272

    
2273
/*----------------------------------------------------------------------------
2274
| Returns the result of multiplying the single-precision floating-point values
2275
| `a' and `b' then adding 'c', with no intermediate rounding step after the
2276
| multiplication.  The operation is performed according to the IEC/IEEE
2277
| Standard for Binary Floating-Point Arithmetic 754-2008.
2278
| The flags argument allows the caller to select negation of the
2279
| addend, the intermediate product, or the final result. (The difference
2280
| between this and having the caller do a separate negation is that negating
2281
| externally will flip the sign bit on NaNs.)
2282
*----------------------------------------------------------------------------*/
2283

    
2284
float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
{
    flag aSign, bSign, cSign, zSign;
    int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
    uint32_t aSig, bSig, cSig;
    flag pInf, pZero, pSign;
    uint64_t pSig64, cSig64, zSig64;
    uint32_t pSig;
    int shiftcount;
    flag signflip, infzero;

    /* Unpack the three operands into sign / exponent / fraction fields. */
    a = float32_squash_input_denormal(a STATUS_VAR);
    b = float32_squash_input_denormal(b STATUS_VAR);
    c = float32_squash_input_denormal(c STATUS_VAR);
    aSig = extractFloat32Frac(a);
    aExp = extractFloat32Exp(a);
    aSign = extractFloat32Sign(a);
    bSig = extractFloat32Frac(b);
    bExp = extractFloat32Exp(b);
    bSign = extractFloat32Sign(b);
    cSig = extractFloat32Frac(c);
    cExp = extractFloat32Exp(c);
    cSign = extractFloat32Sign(c);

    /* True when the product is (zero * infinity) in either operand order. */
    infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
               (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (((aExp == 0xff) && aSig) ||
        ((bExp == 0xff) && bSig) ||
        ((cExp == 0xff) && cSig)) {
        return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
    }

    /* No NaN operand: zero * infinity is simply invalid. */
    if (infzero) {
        float_raise(float_flag_invalid STATUS_VAR);
        return float32_default_nan;
    }

    /* Negations are applied to the unpacked signs, after the NaN checks,
     * so NaN operands are passed to the NaN routine untouched.
     */
    if (flags & float_muladd_negate_c) {
        cSign ^= 1;
    }

    signflip = (flags & float_muladd_negate_result) ? 1 : 0;

    /* Work out the sign and type of the product */
    pSign = aSign ^ bSign;
    if (flags & float_muladd_negate_product) {
        pSign ^= 1;
    }
    pInf = (aExp == 0xff) || (bExp == 0xff);
    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);

    if (cExp == 0xff) {
        if (pInf && (pSign ^ cSign)) {
            /* addition of opposite-signed infinities => InvalidOperation */
            float_raise(float_flag_invalid STATUS_VAR);
            return float32_default_nan;
        }
        /* Otherwise generate an infinity of the same sign */
        return packFloat32(cSign ^ signflip, 0xff, 0);
    }

    if (pInf) {
        return packFloat32(pSign ^ signflip, 0xff, 0);
    }

    if (pZero) {
        if (cExp == 0) {
            if (cSig == 0) {
                /* Adding two exact zeroes */
                if (pSign == cSign) {
                    zSign = pSign;
                } else if (STATUS(float_rounding_mode) == float_round_down) {
                    zSign = 1;
                } else {
                    zSign = 0;
                }
                return packFloat32(zSign ^ signflip, 0, 0);
            }
            /* Exact zero plus a denorm */
            if (STATUS(flush_to_zero)) {
                float_raise(float_flag_output_denormal STATUS_VAR);
                return packFloat32(cSign ^ signflip, 0, 0);
            }
        }
        /* Zero plus something non-zero : just return the something */
        return packFloat32(cSign ^ signflip, cExp, cSig);
    }

    /* Both factors are finite and non-zero from here on. */
    if (aExp == 0) {
        normalizeFloat32Subnormal(aSig, &aExp, &aSig);
    }
    if (bExp == 0) {
        normalizeFloat32Subnormal(bSig, &bExp, &bSig);
    }

    /* Calculate the actual result a * b + c */

    /* Multiply first; this is easy. */
    /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
     * because we want the true exponent, not the "one-less-than"
     * flavour that roundAndPackFloat32() takes.
     */
    pExp = aExp + bExp - 0x7e;
    aSig = (aSig | 0x00800000) << 7;
    bSig = (bSig | 0x00800000) << 8;
    pSig64 = (uint64_t)aSig * bSig;
    /* If bit 62 of the product is clear, shift left once to set it. */
    if ((int64_t)(pSig64 << 1) >= 0) {
        pSig64 <<= 1;
        pExp--;
    }

    zSign = pSign ^ signflip;

    /* Now pSig64 is the significand of the multiply, with the explicit bit in
     * position 62.
     */
    if (cExp == 0) {
        if (!cSig) {
            /* Throw out the special case of c being an exact zero now */
            shift64RightJamming(pSig64, 32, &pSig64);
            pSig = pSig64;
            return roundAndPackFloat32(zSign, pExp - 1,
                                       pSig STATUS_VAR);
        }
        normalizeFloat32Subnormal(cSig, &cExp, &cSig);
    }

    /* Put c's significand in the same layout: explicit bit in position 62. */
    cSig64 = (uint64_t)cSig << (62 - 23);
    cSig64 |= LIT64(0x4000000000000000);
    expDiff = pExp - cExp;

    if (pSign == cSign) {
        /* Addition */
        if (expDiff > 0) {
            /* scale c to match p */
            shift64RightJamming(cSig64, expDiff, &cSig64);
            zExp = pExp;
        } else if (expDiff < 0) {
            /* scale p to match c */
            shift64RightJamming(pSig64, -expDiff, &pSig64);
            zExp = cExp;
        } else {
            /* no scaling needed */
            zExp = cExp;
        }
        /* Add significands and make sure explicit bit ends up in posn 62 */
        zSig64 = pSig64 + cSig64;
        if ((int64_t)zSig64 < 0) {
            shift64RightJamming(zSig64, 1, &zSig64);
        } else {
            zExp--;
        }
    } else {
        /* Subtraction */
        if (expDiff > 0) {
            shift64RightJamming(cSig64, expDiff, &cSig64);
            zSig64 = pSig64 - cSig64;
            zExp = pExp;
        } else if (expDiff < 0) {
            shift64RightJamming(pSig64, -expDiff, &pSig64);
            zSig64 = cSig64 - pSig64;
            zExp = cExp;
            /* c dominates, so the result takes the opposite sign */
            zSign ^= 1;
        } else {
            zExp = pExp;
            if (cSig64 < pSig64) {
                zSig64 = pSig64 - cSig64;
            } else if (pSig64 < cSig64) {
                zSig64 = cSig64 - pSig64;
                zSign ^= 1;
            } else {
                /* Exact zero */
                zSign = signflip;
                if (STATUS(float_rounding_mode) == float_round_down) {
                    zSign ^= 1;
                }
                return packFloat32(zSign, 0, 0);
            }
        }
        --zExp;
        /* Normalize to put the explicit bit back into bit 62. */
        shiftcount = countLeadingZeros64(zSig64) - 1;
        zSig64 <<= shiftcount;
        zExp -= shiftcount;
    }
    /* Reduce the 64-bit significand to 32 bits, keeping a sticky bit. */
    shift64RightJamming(zSig64, 32, &zSig64);
    return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
}
2478

    
2479

    
2480
/*----------------------------------------------------------------------------
2481
| Returns the square root of the single-precision floating-point value `a'.
2482
| The operation is performed according to the IEC/IEEE Standard for Binary
2483
| Floating-Point Arithmetic.
2484
*----------------------------------------------------------------------------*/
2485

    
2486
float32 float32_sqrt( float32 a STATUS_PARAM )
2487
{
2488
    flag aSign;
2489
    int_fast16_t aExp, zExp;
2490
    uint32_t aSig, zSig;
2491
    uint64_t rem, term;
2492
    a = float32_squash_input_denormal(a STATUS_VAR);
2493

    
2494
    aSig = extractFloat32Frac( a );
2495
    aExp = extractFloat32Exp( a );
2496
    aSign = extractFloat32Sign( a );
2497
    if ( aExp == 0xFF ) {
2498
        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2499
        if ( ! aSign ) return a;
2500
        float_raise( float_flag_invalid STATUS_VAR);
2501
        return float32_default_nan;
2502
    }
2503
    if ( aSign ) {
2504
        if ( ( aExp | aSig ) == 0 ) return a;
2505
        float_raise( float_flag_invalid STATUS_VAR);
2506
        return float32_default_nan;
2507
    }
2508
    if ( aExp == 0 ) {
2509
        if ( aSig == 0 ) return float32_zero;
2510
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2511
    }
2512
    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2513
    aSig = ( aSig | 0x00800000 )<<8;
2514
    zSig = estimateSqrt32( aExp, aSig ) + 2;
2515
    if ( ( zSig & 0x7F ) <= 5 ) {
2516
        if ( zSig < 2 ) {
2517
            zSig = 0x7FFFFFFF;
2518
            goto roundAndPack;
2519
        }
2520
        aSig >>= aExp & 1;
2521
        term = ( (uint64_t) zSig ) * zSig;
2522
        rem = ( ( (uint64_t) aSig )<<32 ) - term;
2523
        while ( (int64_t) rem < 0 ) {
2524
            --zSig;
2525
            rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2526
        }
2527
        zSig |= ( rem != 0 );
2528
    }
2529
    shift32RightJamming( zSig, 1, &zSig );
2530
 roundAndPack:
2531
    return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2532

    
2533
}
2534

    
2535
/*----------------------------------------------------------------------------
2536
| Returns the binary exponential of the single-precision floating-point value
2537
| `a'. The operation is performed according to the IEC/IEEE Standard for
2538
| Binary Floating-Point Arithmetic.
2539
|
2540
| Uses the following identities:
2541
|
2542
| 1. -------------------------------------------------------------------------
2543
|      x    x*ln(2)
2544
|     2  = e
2545
|
2546
| 2. -------------------------------------------------------------------------
2547
|                      2     3     4     5           n
2548
|      x        x     x     x     x     x           x
2549
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2550
|               1!    2!    3!    4!    5!          n!
2551
*----------------------------------------------------------------------------*/
2552

    
2553
static const float64 float32_exp2_coefficients[15] =
2554
{
2555
    const_float64( 0x3ff0000000000000ll ), /*  1 */
2556
    const_float64( 0x3fe0000000000000ll ), /*  2 */
2557
    const_float64( 0x3fc5555555555555ll ), /*  3 */
2558
    const_float64( 0x3fa5555555555555ll ), /*  4 */
2559
    const_float64( 0x3f81111111111111ll ), /*  5 */
2560
    const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2561
    const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2562
    const_float64( 0x3efa01a01a01a01all ), /*  8 */
2563
    const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2564
    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2565
    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2566
    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2567
    const_float64( 0x3de6124613a86d09ll ), /* 13 */
2568
    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2569
    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2570
};
2571

    
2572
float32 float32_exp2( float32 a STATUS_PARAM )
2573
{
2574
    flag aSign;
2575
    int_fast16_t aExp;
2576
    uint32_t aSig;
2577
    float64 r, x, xn;
2578
    int i;
2579
    a = float32_squash_input_denormal(a STATUS_VAR);
2580

    
2581
    aSig = extractFloat32Frac( a );
2582
    aExp = extractFloat32Exp( a );
2583
    aSign = extractFloat32Sign( a );
2584

    
2585
    if ( aExp == 0xFF) {
2586
        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2587
        return (aSign) ? float32_zero : a;
2588
    }
2589
    if (aExp == 0) {
2590
        if (aSig == 0) return float32_one;
2591
    }
2592

    
2593
    float_raise( float_flag_inexact STATUS_VAR);
2594

    
2595
    /* ******************************* */
2596
    /* using float64 for approximation */
2597
    /* ******************************* */
2598
    x = float32_to_float64(a STATUS_VAR);
2599
    x = float64_mul(x, float64_ln2 STATUS_VAR);
2600

    
2601
    xn = x;
2602
    r = float64_one;
2603
    for (i = 0 ; i < 15 ; i++) {
2604
        float64 f;
2605

    
2606
        f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2607
        r = float64_add(r, f STATUS_VAR);
2608

    
2609
        xn = float64_mul(xn, x STATUS_VAR);
2610
    }
2611

    
2612
    return float64_to_float32(r, status);
2613
}
2614

    
2615
/*----------------------------------------------------------------------------
2616
| Returns the binary log of the single-precision floating-point value `a'.
2617
| The operation is performed according to the IEC/IEEE Standard for Binary
2618
| Floating-Point Arithmetic.
2619
*----------------------------------------------------------------------------*/
2620
float32 float32_log2( float32 a STATUS_PARAM )
2621
{
2622
    flag aSign, zSign;
2623
    int_fast16_t aExp;
2624
    uint32_t aSig, zSig, i;
2625

    
2626
    a = float32_squash_input_denormal(a STATUS_VAR);
2627
    aSig = extractFloat32Frac( a );
2628
    aExp = extractFloat32Exp( a );
2629
    aSign = extractFloat32Sign( a );
2630

    
2631
    if ( aExp == 0 ) {
2632
        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2633
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2634
    }
2635
    if ( aSign ) {
2636
        float_raise( float_flag_invalid STATUS_VAR);
2637
        return float32_default_nan;
2638
    }
2639
    if ( aExp == 0xFF ) {
2640
        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2641
        return a;
2642
    }
2643

    
2644
    aExp -= 0x7F;
2645
    aSig |= 0x00800000;
2646
    zSign = aExp < 0;
2647
    zSig = aExp << 23;
2648

    
2649
    for (i = 1 << 22; i > 0; i >>= 1) {
2650
        aSig = ( (uint64_t)aSig * aSig ) >> 23;
2651
        if ( aSig & 0x01000000 ) {
2652
            aSig >>= 1;
2653
            zSig |= i;
2654
        }
2655
    }
2656

    
2657
    if ( zSign )
2658
        zSig = -zSig;
2659

    
2660
    return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2661
}
2662

    
2663
/*----------------------------------------------------------------------------
2664
| Returns 1 if the single-precision floating-point value `a' is equal to
2665
| the corresponding value `b', and 0 otherwise.  The invalid exception is
2666
| raised if either operand is a NaN.  Otherwise, the comparison is performed
2667
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2668
*----------------------------------------------------------------------------*/
2669

    
2670
int float32_eq( float32 a, float32 b STATUS_PARAM )
2671
{
2672
    uint32_t av, bv;
2673
    a = float32_squash_input_denormal(a STATUS_VAR);
2674
    b = float32_squash_input_denormal(b STATUS_VAR);
2675

    
2676
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2677
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2678
       ) {
2679
        float_raise( float_flag_invalid STATUS_VAR);
2680
        return 0;
2681
    }
2682
    av = float32_val(a);
2683
    bv = float32_val(b);
2684
    return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2685
}
2686

    
2687
/*----------------------------------------------------------------------------
2688
| Returns 1 if the single-precision floating-point value `a' is less than
2689
| or equal to the corresponding value `b', and 0 otherwise.  The invalid
2690
| exception is raised if either operand is a NaN.  The comparison is performed
2691
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2692
*----------------------------------------------------------------------------*/
2693

    
2694
int float32_le( float32 a, float32 b STATUS_PARAM )
2695
{
2696
    flag aSign, bSign;
2697
    uint32_t av, bv;
2698
    a = float32_squash_input_denormal(a STATUS_VAR);
2699
    b = float32_squash_input_denormal(b STATUS_VAR);
2700

    
2701
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2702
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2703
       ) {
2704
        float_raise( float_flag_invalid STATUS_VAR);
2705
        return 0;
2706
    }
2707
    aSign = extractFloat32Sign( a );
2708
    bSign = extractFloat32Sign( b );
2709
    av = float32_val(a);
2710
    bv = float32_val(b);
2711
    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2712
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2713

    
2714
}
2715

    
2716
/*----------------------------------------------------------------------------
2717
| Returns 1 if the single-precision floating-point value `a' is less than
2718
| the corresponding value `b', and 0 otherwise.  The invalid exception is
2719
| raised if either operand is a NaN.  The comparison is performed according
2720
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2721
*----------------------------------------------------------------------------*/
2722

    
2723
int float32_lt( float32 a, float32 b STATUS_PARAM )
2724
{
2725
    flag aSign, bSign;
2726
    uint32_t av, bv;
2727
    a = float32_squash_input_denormal(a STATUS_VAR);
2728
    b = float32_squash_input_denormal(b STATUS_VAR);
2729

    
2730
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2731
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2732
       ) {
2733
        float_raise( float_flag_invalid STATUS_VAR);
2734
        return 0;
2735
    }
2736
    aSign = extractFloat32Sign( a );
2737
    bSign = extractFloat32Sign( b );
2738
    av = float32_val(a);
2739
    bv = float32_val(b);
2740
    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2741
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2742

    
2743
}
2744

    
2745
/*----------------------------------------------------------------------------
2746
| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2747
| be compared, and 0 otherwise.  The invalid exception is raised if either
2748
| operand is a NaN.  The comparison is performed according to the IEC/IEEE
2749
| Standard for Binary Floating-Point Arithmetic.
2750
*----------------------------------------------------------------------------*/
2751

    
2752
int float32_unordered( float32 a, float32 b STATUS_PARAM )
2753
{
2754
    a = float32_squash_input_denormal(a STATUS_VAR);
2755
    b = float32_squash_input_denormal(b STATUS_VAR);
2756

    
2757
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2758
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2759
       ) {
2760
        float_raise( float_flag_invalid STATUS_VAR);
2761
        return 1;
2762
    }
2763
    return 0;
2764
}
2765

    
2766
/*----------------------------------------------------------------------------
2767
| Returns 1 if the single-precision floating-point value `a' is equal to
2768
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2769
| exception.  The comparison is performed according to the IEC/IEEE Standard
2770
| for Binary Floating-Point Arithmetic.
2771
*----------------------------------------------------------------------------*/
2772

    
2773
int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2774
{
2775
    a = float32_squash_input_denormal(a STATUS_VAR);
2776
    b = float32_squash_input_denormal(b STATUS_VAR);
2777

    
2778
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2779
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2780
       ) {
2781
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2782
            float_raise( float_flag_invalid STATUS_VAR);
2783
        }
2784
        return 0;
2785
    }
2786
    return ( float32_val(a) == float32_val(b) ) ||
2787
            ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2788
}
2789

    
2790
/*----------------------------------------------------------------------------
2791
| Returns 1 if the single-precision floating-point value `a' is less than or
2792
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2793
| cause an exception.  Otherwise, the comparison is performed according to the
2794
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2795
*----------------------------------------------------------------------------*/
2796

    
2797
int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2798
{
2799
    flag aSign, bSign;
2800
    uint32_t av, bv;
2801
    a = float32_squash_input_denormal(a STATUS_VAR);
2802
    b = float32_squash_input_denormal(b STATUS_VAR);
2803

    
2804
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2805
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2806
       ) {
2807
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2808
            float_raise( float_flag_invalid STATUS_VAR);
2809
        }
2810
        return 0;
2811
    }
2812
    aSign = extractFloat32Sign( a );
2813
    bSign = extractFloat32Sign( b );
2814
    av = float32_val(a);
2815
    bv = float32_val(b);
2816
    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2817
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2818

    
2819
}
2820

    
2821
/*----------------------------------------------------------------------------
2822
| Returns 1 if the single-precision floating-point value `a' is less than
2823
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2824
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2825
| Standard for Binary Floating-Point Arithmetic.
2826
*----------------------------------------------------------------------------*/
2827

    
2828
int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2829
{
2830
    flag aSign, bSign;
2831
    uint32_t av, bv;
2832
    a = float32_squash_input_denormal(a STATUS_VAR);
2833
    b = float32_squash_input_denormal(b STATUS_VAR);
2834

    
2835
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2836
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2837
       ) {
2838
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2839
            float_raise( float_flag_invalid STATUS_VAR);
2840
        }
2841
        return 0;
2842
    }
2843
    aSign = extractFloat32Sign( a );
2844
    bSign = extractFloat32Sign( b );
2845
    av = float32_val(a);
2846
    bv = float32_val(b);
2847
    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2848
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2849

    
2850
}
2851

    
2852
/*----------------------------------------------------------------------------
2853
| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2854
| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
2855
| comparison is performed according to the IEC/IEEE Standard for Binary
2856
| Floating-Point Arithmetic.
2857
*----------------------------------------------------------------------------*/
2858

    
2859
int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2860
{
2861
    a = float32_squash_input_denormal(a STATUS_VAR);
2862
    b = float32_squash_input_denormal(b STATUS_VAR);
2863

    
2864
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2865
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2866
       ) {
2867
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2868
            float_raise( float_flag_invalid STATUS_VAR);
2869
        }
2870
        return 1;
2871
    }
2872
    return 0;
2873
}
2874

    
2875
/*----------------------------------------------------------------------------
2876
| Returns the result of converting the double-precision floating-point value
2877
| `a' to the 32-bit two's complement integer format.  The conversion is
2878
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2879
| Arithmetic---which means in particular that the conversion is rounded
2880
| according to the current rounding mode.  If `a' is a NaN, the largest
2881
| positive integer is returned.  Otherwise, if the conversion overflows, the
2882
| largest integer with the same sign as `a' is returned.
2883
*----------------------------------------------------------------------------*/
2884

    
2885
int32 float64_to_int32( float64 a STATUS_PARAM )
2886
{
2887
    flag aSign;
2888
    int_fast16_t aExp, shiftCount;
2889
    uint64_t aSig;
2890
    a = float64_squash_input_denormal(a STATUS_VAR);
2891

    
2892
    aSig = extractFloat64Frac( a );
2893
    aExp = extractFloat64Exp( a );
2894
    aSign = extractFloat64Sign( a );
2895
    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2896
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2897
    shiftCount = 0x42C - aExp;
2898
    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2899
    return roundAndPackInt32( aSign, aSig STATUS_VAR );
2900

    
2901
}
2902

    
2903
/*----------------------------------------------------------------------------
2904
| Returns the result of converting the double-precision floating-point value
2905
| `a' to the 32-bit two's complement integer format.  The conversion is
2906
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2907
| Arithmetic, except that the conversion is always rounded toward zero.
2908
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2909
| the conversion overflows, the largest integer with the same sign as `a' is
2910
| returned.
2911
*----------------------------------------------------------------------------*/
2912

    
2913
int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2914
{
2915
    flag aSign;
2916
    int_fast16_t aExp, shiftCount;
2917
    uint64_t aSig, savedASig;
2918
    int32_t z;
2919
    a = float64_squash_input_denormal(a STATUS_VAR);
2920

    
2921
    aSig = extractFloat64Frac( a );
2922
    aExp = extractFloat64Exp( a );
2923
    aSign = extractFloat64Sign( a );
2924
    if ( 0x41E < aExp ) {
2925
        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2926
        goto invalid;
2927
    }
2928
    else if ( aExp < 0x3FF ) {
2929
        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2930
        return 0;
2931
    }
2932
    aSig |= LIT64( 0x0010000000000000 );
2933
    shiftCount = 0x433 - aExp;
2934
    savedASig = aSig;
2935
    aSig >>= shiftCount;
2936
    z = aSig;
2937
    if ( aSign ) z = - z;
2938
    if ( ( z < 0 ) ^ aSign ) {
2939
 invalid:
2940
        float_raise( float_flag_invalid STATUS_VAR);
2941
        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2942
    }
2943
    if ( ( aSig<<shiftCount ) != savedASig ) {
2944
        STATUS(float_exception_flags) |= float_flag_inexact;
2945
    }
2946
    return z;
2947

    
2948
}
2949

    
2950
/*----------------------------------------------------------------------------
2951
| Returns the result of converting the double-precision floating-point value
2952
| `a' to the 16-bit two's complement integer format.  The conversion is
2953
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2954
| Arithmetic, except that the conversion is always rounded toward zero.
2955
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2956
| the conversion overflows, the largest integer with the same sign as `a' is
2957
| returned.
2958
*----------------------------------------------------------------------------*/
2959

    
2960
int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)
2961
{
2962
    flag aSign;
2963
    int_fast16_t aExp, shiftCount;
2964
    uint64_t aSig, savedASig;
2965
    int32 z;
2966

    
2967
    aSig = extractFloat64Frac( a );
2968
    aExp = extractFloat64Exp( a );
2969
    aSign = extractFloat64Sign( a );
2970
    if ( 0x40E < aExp ) {
2971
        if ( ( aExp == 0x7FF ) && aSig ) {
2972
            aSign = 0;
2973
        }
2974
        goto invalid;
2975
    }
2976
    else if ( aExp < 0x3FF ) {
2977
        if ( aExp || aSig ) {
2978
            STATUS(float_exception_flags) |= float_flag_inexact;
2979
        }
2980
        return 0;
2981
    }
2982
    aSig |= LIT64( 0x0010000000000000 );
2983
    shiftCount = 0x433 - aExp;
2984
    savedASig = aSig;
2985
    aSig >>= shiftCount;
2986
    z = aSig;
2987
    if ( aSign ) {
2988
        z = - z;
2989
    }
2990
    if ( ( (int16_t)z < 0 ) ^ aSign ) {
2991
 invalid:
2992
        float_raise( float_flag_invalid STATUS_VAR);
2993
        return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2994
    }
2995
    if ( ( aSig<<shiftCount ) != savedASig ) {
2996
        STATUS(float_exception_flags) |= float_flag_inexact;
2997
    }
2998
    return z;
2999
}
3000

    
3001
/*----------------------------------------------------------------------------
3002
| Returns the result of converting the double-precision floating-point value
3003
| `a' to the 64-bit two's complement integer format.  The conversion is
3004
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3005
| Arithmetic---which means in particular that the conversion is rounded
3006
| according to the current rounding mode.  If `a' is a NaN, the largest
3007
| positive integer is returned.  Otherwise, if the conversion overflows, the
3008
| largest integer with the same sign as `a' is returned.
3009
*----------------------------------------------------------------------------*/
3010

    
3011
int64 float64_to_int64( float64 a STATUS_PARAM )
3012
{
3013
    flag aSign;
3014
    int_fast16_t aExp, shiftCount;
3015
    uint64_t aSig, aSigExtra;
3016
    a = float64_squash_input_denormal(a STATUS_VAR);
3017

    
3018
    aSig = extractFloat64Frac( a );
3019
    aExp = extractFloat64Exp( a );
3020
    aSign = extractFloat64Sign( a );
3021
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3022
    shiftCount = 0x433 - aExp;
3023
    if ( shiftCount <= 0 ) {
3024
        if ( 0x43E < aExp ) {
3025
            float_raise( float_flag_invalid STATUS_VAR);
3026
            if (    ! aSign
3027
                 || (    ( aExp == 0x7FF )
3028
                      && ( aSig != LIT64( 0x0010000000000000 ) ) )
3029
               ) {
3030
                return LIT64( 0x7FFFFFFFFFFFFFFF );
3031
            }
3032
            return (int64_t) LIT64( 0x8000000000000000 );
3033
        }
3034
        aSigExtra = 0;
3035
        aSig <<= - shiftCount;
3036
    }
3037
    else {
3038
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3039
    }
3040
    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3041

    
3042
}
3043

    
3044
/*----------------------------------------------------------------------------
3045
| Returns the result of converting the double-precision floating-point value
3046
| `a' to the 64-bit two's complement integer format.  The conversion is
3047
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3048
| Arithmetic, except that the conversion is always rounded toward zero.
3049
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3050
| the conversion overflows, the largest integer with the same sign as `a' is
3051
| returned.
3052
*----------------------------------------------------------------------------*/
3053

    
3054
int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
3055
{
3056
    flag aSign;
3057
    int_fast16_t aExp, shiftCount;
3058
    uint64_t aSig;
3059
    int64 z;
3060
    a = float64_squash_input_denormal(a STATUS_VAR);
3061

    
3062
    aSig = extractFloat64Frac( a );
3063
    aExp = extractFloat64Exp( a );
3064
    aSign = extractFloat64Sign( a );
3065
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3066
    shiftCount = aExp - 0x433;
3067
    if ( 0 <= shiftCount ) {
3068
        if ( 0x43E <= aExp ) {
3069
            if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3070
                float_raise( float_flag_invalid STATUS_VAR);
3071
                if (    ! aSign
3072
                     || (    ( aExp == 0x7FF )
3073
                          && ( aSig != LIT64( 0x0010000000000000 ) ) )
3074
                   ) {
3075
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
3076
                }
3077
            }
3078
            return (int64_t) LIT64( 0x8000000000000000 );
3079
        }
3080
        z = aSig<<shiftCount;
3081
    }
3082
    else {
3083
        if ( aExp < 0x3FE ) {
3084
            if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3085
            return 0;
3086
        }
3087
        z = aSig>>( - shiftCount );
3088
        if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3089
            STATUS(float_exception_flags) |= float_flag_inexact;
3090
        }
3091
    }
3092
    if ( aSign ) z = - z;
3093
    return z;
3094

    
3095
}
3096

    
3097
/*----------------------------------------------------------------------------
3098
| Returns the result of converting the double-precision floating-point value
3099
| `a' to the single-precision floating-point format.  The conversion is
3100
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3101
| Arithmetic.
3102
*----------------------------------------------------------------------------*/
3103

    
3104
float32 float64_to_float32( float64 a STATUS_PARAM )
3105
{
3106
    flag aSign;
3107
    int_fast16_t aExp;
3108
    uint64_t aSig;
3109
    uint32_t zSig;
3110
    a = float64_squash_input_denormal(a STATUS_VAR);
3111

    
3112
    aSig = extractFloat64Frac( a );
3113
    aExp = extractFloat64Exp( a );
3114
    aSign = extractFloat64Sign( a );
3115
    if ( aExp == 0x7FF ) {
3116
        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3117
        return packFloat32( aSign, 0xFF, 0 );
3118
    }
3119
    shift64RightJamming( aSig, 22, &aSig );
3120
    zSig = aSig;
3121
    if ( aExp || zSig ) {
3122
        zSig |= 0x40000000;
3123
        aExp -= 0x381;
3124
    }
3125
    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
3126

    
3127
}
3128

    
3129

    
3130
/*----------------------------------------------------------------------------
3131
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3132
| half-precision floating-point value, returning the result.  After being
3133
| shifted into the proper positions, the three fields are simply added
3134
| together to form the result.  This means that any integer portion of `zSig'
3135
| will be added into the exponent.  Since a properly normalized significand
3136
| will have an integer portion equal to 1, the `zExp' input should be 1 less
3137
| than the desired result exponent whenever `zSig' is a complete, normalized
3138
| significand.
3139
*----------------------------------------------------------------------------*/
3140
static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
{
    /* Position each field, then add (not OR) them so that any integer
     * part of zSig carries into the exponent field. */
    uint32_t signField = ((uint32_t)zSign) << 15;
    uint32_t expField = ((uint32_t)zExp) << 10;

    return make_float16(signField + expField + zSig);
}
3145

    
3146
/*----------------------------------------------------------------------------
3147
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3148
| and significand `zSig', and returns the proper half-precision floating-
3149
| point value corresponding to the abstract input.  Ordinarily, the abstract
3150
| value is simply rounded and packed into the half-precision format, with
3151
| the inexact exception raised if the abstract input cannot be represented
3152
| exactly.  However, if the abstract value is too large, the overflow and
3153
| inexact exceptions are raised and an infinity or maximal finite value is
3154
| returned.  If the abstract value is too small, the input value is rounded to
3155
| a subnormal number, and the underflow and inexact exceptions are raised if
3156
| the abstract input cannot be represented exactly as a subnormal half-
3157
| precision floating-point number.
3158
| The `ieee' flag indicates whether to use IEEE standard half precision, or
3159
| ARM-style "alternative representation", which omits the NaN and Inf
3160
| encodings in order to raise the maximum representable exponent by one.
3161
|     The input significand `zSig' has its binary point between bits 22
3162
| and 23, which is 13 bits to the left of the usual location.  This shifted
3163
| significand must be normalized or smaller.  If `zSig' is not normalized,
3164
| `zExp' must be 0; in that case, the result returned is a subnormal number,
3165
| and it must not require rounding.  In the usual case that `zSig' is
3166
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3167
| Note the slightly odd position of the binary point in zSig compared with the
3168
| other roundAndPackFloat functions. This should probably be fixed if we
3169
| need to implement more float16 routines than just conversion.
3170
| The handling of underflow and overflow follows the IEC/IEEE Standard for
3171
| Binary Floating-Point Arithmetic.
3172
*----------------------------------------------------------------------------*/
3173

    
3174
static float32 roundAndPackFloat16(flag zSign, int_fast16_t zExp,
3175
                                   uint32_t zSig, flag ieee STATUS_PARAM)
3176
{
3177
    int maxexp = ieee ? 29 : 30;
3178
    uint32_t mask;
3179
    uint32_t increment;
3180
    bool rounding_bumps_exp;
3181
    bool is_tiny = false;
3182

    
3183
    /* Calculate the mask of bits of the mantissa which are not
3184
     * representable in half-precision and will be lost.
3185
     */
3186
    if (zExp < 1) {
3187
        /* Will be denormal in halfprec */
3188
        mask = 0x00ffffff;
3189
        if (zExp >= -11) {
3190
            mask >>= 11 + zExp;
3191
        }
3192
    } else {
3193
        /* Normal number in halfprec */
3194
        mask = 0x00001fff;
3195
    }
3196

    
3197
    switch (STATUS(float_rounding_mode)) {
3198
    case float_round_nearest_even:
3199
        increment = (mask + 1) >> 1;
3200
        if ((zSig & mask) == increment) {
3201
            increment = zSig & (increment << 1);
3202
        }
3203
        break;
3204
    case float_round_ties_away:
3205
        increment = (mask + 1) >> 1;
3206
        break;
3207
    case float_round_up:
3208
        increment = zSign ? 0 : mask;
3209
        break;
3210
    case float_round_down:
3211
        increment = zSign ? mask : 0;
3212
        break;
3213
    default: /* round_to_zero */
3214
        increment = 0;
3215
        break;
3216
    }
3217

    
3218
    rounding_bumps_exp = (zSig + increment >= 0x01000000);
3219

    
3220
    if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3221
        if (ieee) {
3222
            float_raise(float_flag_overflow | float_flag_inexact STATUS_VAR);
3223
            return packFloat16(zSign, 0x1f, 0);
3224
        } else {
3225
            float_raise(float_flag_invalid STATUS_VAR);
3226
            return packFloat16(zSign, 0x1f, 0x3ff);
3227
        }
3228
    }
3229

    
3230
    if (zExp < 0) {
3231
        /* Note that flush-to-zero does not affect half-precision results */
3232
        is_tiny =
3233
            (STATUS(float_detect_tininess) == float_tininess_before_rounding)
3234
            || (zExp < -1)
3235
            || (!rounding_bumps_exp);
3236
    }
3237
    if (zSig & mask) {
3238
        float_raise(float_flag_inexact STATUS_VAR);
3239
        if (is_tiny) {
3240
            float_raise(float_flag_underflow STATUS_VAR);
3241
        }
3242
    }
3243

    
3244
    zSig += increment;
3245
    if (rounding_bumps_exp) {
3246
        zSig >>= 1;
3247
        zExp++;
3248
    }
3249

    
3250
    if (zExp < -10) {
3251
        return packFloat16(zSign, 0, 0);
3252
    }
3253
    if (zExp < 0) {
3254
        zSig >>= -zExp;
3255
        zExp = 0;
3256
    }
3257
    return packFloat16(zSign, zExp, zSig >> 13);
3258
}
3259

    
3260
static void normalizeFloat16Subnormal(uint32_t aSig, int_fast16_t *zExpPtr,
                                      uint32_t *zSigPtr)
{
    /* Left shift derived from the leading-zero count of aSig, less 21. */
    int8_t leftShift = countLeadingZeros32(aSig) - 21;

    *zSigPtr = aSig << leftShift;
    /* The exponent handed back compensates for the shift just applied. */
    *zExpPtr = 1 - leftShift;
}
3267

    
3268
/* Half precision floats come in two formats: standard IEEE and "ARM" format.
3269
   The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3270

    
3271
float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
{
    flag sign = extractFloat16Sign(a);
    int_fast16_t expo = extractFloat16Exp(a);
    uint32_t frac = extractFloat16Frac(a);

    /* Only the IEEE format reserves exponent 0x1f for Inf/NaN. */
    if (ieee && expo == 0x1f) {
        if (frac != 0) {
            return commonNaNToFloat32(
                float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
        }
        return packFloat32(sign, 0xff, 0);
    }

    if (expo == 0) {
        if (frac == 0) {
            /* Signed zero converts to signed zero. */
            return packFloat32(sign, 0, 0);
        }
        /* Half-precision subnormal: normalize it, then step the exponent
         * down by one before the final packing. */
        normalizeFloat16Subnormal(frac, &expo, &frac);
        expo--;
    }

    /* Rebias the exponent and widen the fraction by 13 bits. */
    return packFloat32(sign, expo + 0x70, frac << 13);
}
3297

    
3298
float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
{
    flag sign;
    int_fast16_t expo;
    uint32_t sig;

    a = float32_squash_input_denormal(a STATUS_VAR);

    sig = extractFloat32Frac(a);
    expo = extractFloat32Exp(a);
    sign = extractFloat32Sign(a);

    if (expo == 0xFF) {
        if (sig != 0) {
            /* NaN input */
            if (ieee) {
                return commonNaNToFloat16(
                    float32ToCommonNaN(a STATUS_VAR) STATUS_VAR);
            }
            /* The alternative format has no NaN encoding: raise invalid
             * and return a signed zero. */
            float_raise(float_flag_invalid STATUS_VAR);
            return packFloat16(sign, 0, 0);
        }
        /* Infinite input */
        if (ieee) {
            return packFloat16(sign, 0x1f, 0);
        }
        /* The alternative format has no infinity: raise invalid and
         * return the all-ones exponent/fraction encoding. */
        float_raise(float_flag_invalid STATUS_VAR);
        return packFloat16(sign, 0x1f, 0x3ff);
    }

    if (expo == 0 && sig == 0) {
        return packFloat16(sign, 0, 0);
    }

    /* Binary point sits between bits 22 and 23.  The implicit 1 bit is
     * set even for a denormal input; that is harmless, because the
     * largest single-precision denormal is smaller than the smallest
     * half-precision denormal, so roundAndPackFloat16 ignores the
     * significand and returns zero.
     */
    sig |= 0x00800000;
    expo -= 0x71;

    return roundAndPackFloat16(sign, expo, sig, ieee STATUS_VAR);
}
3341

    
3342
float64 float16_to_float64(float16 a, flag ieee STATUS_PARAM)
{
    flag sign = extractFloat16Sign(a);
    int_fast16_t expo = extractFloat16Exp(a);
    uint32_t frac = extractFloat16Frac(a);

    /* Only the IEEE format reserves exponent 0x1f for Inf/NaN. */
    if (ieee && expo == 0x1f) {
        if (frac != 0) {
            return commonNaNToFloat64(
                float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
        }
        return packFloat64(sign, 0x7ff, 0);
    }

    if (expo == 0) {
        if (frac == 0) {
            /* Signed zero converts to signed zero. */
            return packFloat64(sign, 0, 0);
        }
        /* Half-precision subnormal: normalize it, then step the exponent
         * down by one before the final packing. */
        normalizeFloat16Subnormal(frac, &expo, &frac);
        expo--;
    }

    /* Rebias the exponent and widen the fraction by 42 bits. */
    return packFloat64(sign, expo + 0x3f0, ((uint64_t)frac) << 42);
}
3369

    
3370
float16 float64_to_float16(float64 a, flag ieee STATUS_PARAM)
{
    flag sign;
    int_fast16_t expo;
    uint64_t wideSig;
    uint32_t sig;

    a = float64_squash_input_denormal(a STATUS_VAR);

    wideSig = extractFloat64Frac(a);
    expo = extractFloat64Exp(a);
    sign = extractFloat64Sign(a);

    if (expo == 0x7FF) {
        if (wideSig != 0) {
            /* NaN input */
            if (ieee) {
                return commonNaNToFloat16(
                    float64ToCommonNaN(a STATUS_VAR) STATUS_VAR);
            }
            /* The alternative format has no NaN encoding: raise invalid
             * and return a signed zero. */
            float_raise(float_flag_invalid STATUS_VAR);
            return packFloat16(sign, 0, 0);
        }
        /* Infinite input */
        if (ieee) {
            return packFloat16(sign, 0x1f, 0);
        }
        /* The alternative format has no infinity: raise invalid and
         * return the all-ones exponent/fraction encoding. */
        float_raise(float_flag_invalid STATUS_VAR);
        return packFloat16(sign, 0x1f, 0x3ff);
    }

    /* Drop 29 fraction bits, keeping a sticky bit for anything lost. */
    shift64RightJamming(wideSig, 29, &wideSig);
    sig = wideSig;
    if (expo == 0 && sig == 0) {
        return packFloat16(sign, 0, 0);
    }

    /* Binary point sits between bits 22 and 23.  The implicit 1 bit is
     * set even for a denormal input; that is harmless, because the
     * largest double-precision denormal is smaller than the smallest
     * half-precision denormal, so roundAndPackFloat16 ignores the
     * significand and returns zero.
     */
    sig |= 0x00800000;
    expo -= 0x3F1;

    return roundAndPackFloat16(sign, expo, sig, ieee STATUS_VAR);
}
3416

    
3417
/*----------------------------------------------------------------------------
3418
| Returns the result of converting the double-precision floating-point value
3419
| `a' to the extended double-precision floating-point format.  The conversion
3420
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3421
| Arithmetic.
3422
*----------------------------------------------------------------------------*/
3423

    
3424
floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
{
    flag sign;
    int_fast16_t expo;
    uint64_t frac;

    a = float64_squash_input_denormal(a STATUS_VAR);
    frac = extractFloat64Frac(a);
    expo = extractFloat64Exp(a);
    sign = extractFloat64Sign(a);

    /* Maximum exponent: NaN (non-zero fraction) or infinity. */
    if (expo == 0x7FF) {
        if (frac != 0) {
            return commonNaNToFloatx80(
                float64ToCommonNaN(a STATUS_VAR) STATUS_VAR);
        }
        return packFloatx80(sign, 0x7FFF, LIT64(0x8000000000000000));
    }

    if (expo == 0) {
        if (frac == 0) {
            return packFloatx80(sign, 0, 0);
        }
        /* Subnormal input: normalize so it can be packed like a normal. */
        normalizeFloat64Subnormal(frac, &expo, &frac);
    }

    /* Set the integer bit explicitly, move the significand to the top of
     * the 64-bit field and rebias the exponent. */
    return packFloatx80(sign, expo + 0x3C00,
                        (frac | LIT64(0x0010000000000000)) << 11);
}
3447

    
3448
/*----------------------------------------------------------------------------
3449
| Returns the result of converting the double-precision floating-point value
3450
| `a' to the quadruple-precision floating-point format.  The conversion is
3451
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3452
| Arithmetic.
3453
*----------------------------------------------------------------------------*/
3454

    
3455
float128 float64_to_float128( float64 a STATUS_PARAM )
{
    flag sign;
    int_fast16_t expo;
    uint64_t frac, sigHigh, sigLow;

    a = float64_squash_input_denormal(a STATUS_VAR);
    frac = extractFloat64Frac(a);
    expo = extractFloat64Exp(a);
    sign = extractFloat64Sign(a);

    /* Maximum exponent: NaN (non-zero fraction) or infinity. */
    if (expo == 0x7FF) {
        if (frac != 0) {
            return commonNaNToFloat128(
                float64ToCommonNaN(a STATUS_VAR) STATUS_VAR);
        }
        return packFloat128(sign, 0x7FFF, 0, 0);
    }

    if (expo == 0) {
        if (frac == 0) {
            return packFloat128(sign, 0, 0, 0);
        }
        /* Subnormal input: normalize, then step the exponent down by
         * one before the final packing. */
        normalizeFloat64Subnormal(frac, &expo, &frac);
        --expo;
    }

    /* Spread the fraction over the 128-bit significand pair by shifting
     * the value (frac, 0) right by 4 bits. */
    shift128Right(frac, 0, 4, &sigHigh, &sigLow);
    return packFloat128(sign, expo + 0x3C00, sigHigh, sigLow);
}
3478

    
3479
/*----------------------------------------------------------------------------
3480
| Rounds the double-precision floating-point value `a' to an integer, and
3481
| returns the result as a double-precision floating-point value.  The
3482
| operation is performed according to the IEC/IEEE Standard for Binary
3483
| Floating-Point Arithmetic.
3484
*----------------------------------------------------------------------------*/
3485

    
3486
float64 float64_round_to_int( float64 a STATUS_PARAM )
{
    flag negative;
    int_fast16_t biasedExp;
    uint64_t unitBit, fracMask;
    uint64_t bits;

    a = float64_squash_input_denormal(a STATUS_VAR);
    biasedExp = extractFloat64Exp(a);

    /* Exponent 0x433 or above: returned unchanged, except that a NaN
     * goes through NaN propagation. */
    if (biasedExp >= 0x433) {
        if (biasedExp == 0x7FF && extractFloat64Frac(a)) {
            return propagateFloat64NaN(a, a STATUS_VAR);
        }
        return a;
    }

    /* Exponent below 0x3FF: the result is a zero or +/-1. */
    if (biasedExp < 0x3FF) {
        if ((uint64_t)(float64_val(a) << 1) == 0) {
            /* Signed zero is returned unchanged. */
            return a;
        }
        STATUS(float_exception_flags) |= float_flag_inexact;
        negative = extractFloat64Sign(a);
        switch (STATUS(float_rounding_mode)) {
        case float_round_nearest_even:
            if (biasedExp == 0x3FE && extractFloat64Frac(a)) {
                return packFloat64(negative, 0x3FF, 0);
            }
            break;
        case float_round_ties_away:
            if (biasedExp == 0x3FE) {
                return packFloat64(negative, 0x3ff, 0);
            }
            break;
        case float_round_down:
            return make_float64(negative ? LIT64(0xBFF0000000000000) : 0);
        case float_round_up:
            return make_float64(
                negative ? LIT64(0x8000000000000000)
                         : LIT64(0x3FF0000000000000));
        }
        /* Every remaining case yields a zero with the input's sign. */
        return packFloat64(negative, 0, 0);
    }

    /* General case: work directly on the bit pattern.  unitBit is a
     * single set bit at position (0x433 - biasedExp); fracMask covers
     * every bit below it. */
    unitBit = (uint64_t)1 << (0x433 - biasedExp);
    fracMask = unitBit - 1;
    bits = float64_val(a);

    switch (STATUS(float_rounding_mode)) {
    case float_round_nearest_even:
        bits += unitBit >> 1;
        if ((bits & fracMask) == 0) {
            /* Exact tie: clear the unit bit so the result is even. */
            bits &= ~unitBit;
        }
        break;
    case float_round_ties_away:
        bits += unitBit >> 1;
        break;
    case float_round_to_zero:
        break;
    case float_round_up:
        if (!extractFloat64Sign(make_float64(bits))) {
            bits += fracMask;
        }
        break;
    case float_round_down:
        if (extractFloat64Sign(make_float64(bits))) {
            bits += fracMask;
        }
        break;
    default:
        abort();
    }

    bits &= ~fracMask;
    if (bits != float64_val(a)) {
        STATUS(float_exception_flags) |= float_flag_inexact;
    }
    return make_float64(bits);
}
3559

    
3560
float64 float64_trunc_to_int( float64 a STATUS_PARAM)
{
    /* Round to integer with the rounding mode temporarily forced to
     * round-to-zero, then restore the caller's mode. */
    int savedMode = STATUS(float_rounding_mode);
    float64 truncated;

    STATUS(float_rounding_mode) = float_round_to_zero;
    truncated = float64_round_to_int(a STATUS_VAR);
    STATUS(float_rounding_mode) = savedMode;

    return truncated;
}
3570

    
3571
/*----------------------------------------------------------------------------
3572
| Returns the result of adding the absolute values of the double-precision
3573
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3574
| before being returned.  `zSign' is ignored if the result is a NaN.
3575
| The addition is performed according to the IEC/IEEE Standard for Binary
3576
| Floating-Point Arithmetic.
3577
*----------------------------------------------------------------------------*/
3578

    
3579
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
{
    int_fast16_t expA, expB, expZ;
    uint64_t sigA, sigB, sigZ, sum;
    int_fast16_t delta;

    sigA = extractFloat64Frac(a);
    expA = extractFloat64Exp(a);
    sigB = extractFloat64Frac(b);
    expB = extractFloat64Exp(b);
    delta = expA - expB;

    /* Move both fractions up by 9 bits to make room for the extra bits
     * used during alignment and rounding. */
    sigA <<= 9;
    sigB <<= 9;

    if (delta > 0) {
        /* a has the larger exponent: align b to it. */
        if (expA == 0x7FF) {
            if (sigA) {
                return propagateFloat64NaN(a, b STATUS_VAR);
            }
            return a;
        }
        if (expB == 0) {
            --delta;
        } else {
            sigB |= LIT64(0x2000000000000000);
        }
        shift64RightJamming(sigB, delta, &sigB);
        expZ = expA;
    } else if (delta < 0) {
        /* b has the larger exponent: align a to it. */
        if (expB == 0x7FF) {
            if (sigB) {
                return propagateFloat64NaN(a, b STATUS_VAR);
            }
            return packFloat64(zSign, 0x7FF, 0);
        }
        if (expA == 0) {
            ++delta;
        } else {
            sigA |= LIT64(0x2000000000000000);
        }
        shift64RightJamming(sigA, -delta, &sigA);
        expZ = expB;
    } else {
        /* Equal exponents. */
        if (expA == 0x7FF) {
            if (sigA | sigB) {
                return propagateFloat64NaN(a, b STATUS_VAR);
            }
            return a;
        }
        if (expA == 0) {
            /* Both operands are zero or subnormal. */
            if (STATUS(flush_to_zero)) {
                if (sigA | sigB) {
                    float_raise(float_flag_output_denormal STATUS_VAR);
                }
                return packFloat64(zSign, 0, 0);
            }
            return packFloat64(zSign, 0, (sigA + sigB) >> 9);
        }
        /* Two normal numbers with the same exponent: the constant
         * accounts for both implicit leading bits. */
        sigZ = LIT64(0x4000000000000000) + sigA + sigB;
        return roundAndPackFloat64(zSign, expA, sigZ STATUS_VAR);
    }

    /* Unequal exponents: the larger operand still needs its implicit
     * bit (the smaller one got it, if applicable, before alignment). */
    sigA |= LIT64(0x2000000000000000);
    sum = sigA + sigB;
    if ((int64_t)(sum << 1) < 0) {
        /* Doubling would reach the top bit: keep the sum and exponent
         * as they are. */
        sigZ = sum;
    } else {
        sigZ = sum << 1;
        --expZ;
    }
    return roundAndPackFloat64(zSign, expZ, sigZ STATUS_VAR);
}
3649

    
3650
/*----------------------------------------------------------------------------
3651
| Returns the result of subtracting the absolute values of the double-
3652
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
3653
| difference is negated before being returned.  `zSign' is ignored if the
3654
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
3655
| Standard for Binary Floating-Point Arithmetic.
3656
*----------------------------------------------------------------------------*/
3657

    
3658
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
{
    int_fast16_t expA, expB, expZ;
    uint64_t sigA, sigB, sigZ;
    int_fast16_t delta;
    flag aIsLarger;

    sigA = extractFloat64Frac(a);
    expA = extractFloat64Exp(a);
    sigB = extractFloat64Frac(b);
    expB = extractFloat64Exp(b);
    delta = expA - expB;

    /* Move both fractions up by 10 bits to make room for the extra bits
     * used during alignment and rounding. */
    sigA <<= 10;
    sigB <<= 10;

    if (delta > 0) {
        /* a has the larger exponent: align b to it. */
        if (expA == 0x7FF) {
            if (sigA) {
                return propagateFloat64NaN(a, b STATUS_VAR);
            }
            return a;
        }
        if (expB == 0) {
            --delta;
        } else {
            sigB |= LIT64(0x4000000000000000);
        }
        shift64RightJamming(sigB, delta, &sigB);
        sigA |= LIT64(0x4000000000000000);
        aIsLarger = 1;
    } else if (delta < 0) {
        /* b has the larger exponent: align a to it. */
        if (expB == 0x7FF) {
            if (sigB) {
                return propagateFloat64NaN(a, b STATUS_VAR);
            }
            return packFloat64(zSign ^ 1, 0x7FF, 0);
        }
        if (expA == 0) {
            ++delta;
        } else {
            sigA |= LIT64(0x4000000000000000);
        }
        shift64RightJamming(sigA, -delta, &sigA);
        sigB |= LIT64(0x4000000000000000);
        aIsLarger = 0;
    } else {
        /* Equal exponents. */
        if (expA == 0x7FF) {
            if (sigA | sigB) {
                return propagateFloat64NaN(a, b STATUS_VAR);
            }
            /* Infinity minus infinity is an invalid operation. */
            float_raise(float_flag_invalid STATUS_VAR);
            return float64_default_nan;
        }
        if (expA == 0) {
            expA = 1;
            expB = 1;
        }
        if (sigA == sigB) {
            /* Exact cancellation: the zero is negative only when
             * rounding down. */
            return packFloat64(
                STATUS(float_rounding_mode) == float_round_down, 0, 0);
        }
        aIsLarger = (sigB < sigA);
    }

    if (aIsLarger) {
        sigZ = sigA - sigB;
        expZ = expA;
    } else {
        /* |b| is the larger magnitude: the result sign flips. */
        sigZ = sigB - sigA;
        expZ = expB;
        zSign ^= 1;
    }
    --expZ;
    return normalizeRoundAndPackFloat64(zSign, expZ, sigZ STATUS_VAR);
}
3724

    
3725
/*----------------------------------------------------------------------------
3726
| Returns the result of adding the double-precision floating-point values `a'
3727
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
3728
| Binary Floating-Point Arithmetic.
3729
*----------------------------------------------------------------------------*/
3730

    
3731
float64 float64_add( float64 a, float64 b STATUS_PARAM )
{
    flag signA, signB;

    a = float64_squash_input_denormal(a STATUS_VAR);
    b = float64_squash_input_denormal(b STATUS_VAR);

    signA = extractFloat64Sign(a);
    signB = extractFloat64Sign(b);

    /* Same signs: the magnitudes add.  Opposite signs: they subtract. */
    if (signA != signB) {
        return subFloat64Sigs(a, b, signA STATUS_VAR);
    }
    return addFloat64Sigs(a, b, signA STATUS_VAR);
}
3747

    
3748
/*----------------------------------------------------------------------------
3749
| Returns the result of subtracting the double-precision floating-point values
3750
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3751
| for Binary Floating-Point Arithmetic.
3752
*----------------------------------------------------------------------------*/
3753

    
3754
float64 float64_sub( float64 a, float64 b STATUS_PARAM )
{
    flag signA, signB;

    a = float64_squash_input_denormal(a STATUS_VAR);
    b = float64_squash_input_denormal(b STATUS_VAR);

    signA = extractFloat64Sign(a);
    signB = extractFloat64Sign(b);

    /* Same signs: the magnitudes subtract.  Opposite signs: they add. */
    if (signA != signB) {
        return addFloat64Sigs(a, b, signA STATUS_VAR);
    }
    return subFloat64Sigs(a, b, signA STATUS_VAR);
}
3770

    
3771
/*----------------------------------------------------------------------------
3772
| Returns the result of multiplying the double-precision floating-point values
3773
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3774
| for Binary Floating-Point Arithmetic.
3775
*----------------------------------------------------------------------------*/
3776

    
3777
float64 float64_mul( float64 a, float64 b STATUS_PARAM )
{
    flag signA, signB, signZ;
    int_fast16_t expA, expB, expZ;
    uint64_t sigA, sigB, prodHigh, prodLow;

    a = float64_squash_input_denormal(a STATUS_VAR);
    b = float64_squash_input_denormal(b STATUS_VAR);

    sigA = extractFloat64Frac(a);
    expA = extractFloat64Exp(a);
    signA = extractFloat64Sign(a);
    sigB = extractFloat64Frac(b);
    expB = extractFloat64Exp(b);
    signB = extractFloat64Sign(b);
    signZ = signA ^ signB;

    /* a is NaN or infinity. */
    if (expA == 0x7FF) {
        if (sigA || ((expB == 0x7FF) && sigB)) {
            return propagateFloat64NaN(a, b STATUS_VAR);
        }
        if ((expB | sigB) == 0) {
            /* infinity * zero */
            float_raise(float_flag_invalid STATUS_VAR);
            return float64_default_nan;
        }
        return packFloat64(signZ, 0x7FF, 0);
    }
    /* b is NaN or infinity. */
    if (expB == 0x7FF) {
        if (sigB) {
            return propagateFloat64NaN(a, b STATUS_VAR);
        }
        if ((expA | sigA) == 0) {
            /* zero * infinity */
            float_raise(float_flag_invalid STATUS_VAR);
            return float64_default_nan;
        }
        return packFloat64(signZ, 0x7FF, 0);
    }

    /* Zero operands give a signed zero; subnormals are normalized. */
    if (expA == 0) {
        if (sigA == 0) {
            return packFloat64(signZ, 0, 0);
        }
        normalizeFloat64Subnormal(sigA, &expA, &sigA);
    }
    if (expB == 0) {
        if (sigB == 0) {
            return packFloat64(signZ, 0, 0);
        }
        normalizeFloat64Subnormal(sigB, &expB, &sigB);
    }

    expZ = expA + expB - 0x3FF;
    sigA = (sigA | LIT64(0x0010000000000000)) << 10;
    sigB = (sigB | LIT64(0x0010000000000000)) << 11;

    /* Full 128-bit product; the low half only contributes a sticky bit. */
    mul64To128(sigA, sigB, &prodHigh, &prodLow);
    prodHigh |= (prodLow != 0);

    /* If bit 62 is clear, shift up one place and adjust the exponent. */
    if ((int64_t)(prodHigh << 1) >= 0) {
        prodHigh <<= 1;
        --expZ;
    }
    return roundAndPackFloat64(signZ, expZ, prodHigh STATUS_VAR);
}
3831

    
3832
/*----------------------------------------------------------------------------
3833
| Returns the result of dividing the double-precision floating-point value `a'
3834
| by the corresponding value `b'.  The operation is performed according to
3835
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3836
*----------------------------------------------------------------------------*/
3837

    
3838
float64 float64_div( float64 a, float64 b STATUS_PARAM )
{
    flag signA, signB, signZ;
    int_fast16_t expA, expB, expZ;
    uint64_t sigA, sigB, quot;
    uint64_t remHigh, remLow;
    uint64_t prodHigh, prodLow;

    a = float64_squash_input_denormal(a STATUS_VAR);
    b = float64_squash_input_denormal(b STATUS_VAR);

    sigA = extractFloat64Frac(a);
    expA = extractFloat64Exp(a);
    signA = extractFloat64Sign(a);
    sigB = extractFloat64Frac(b);
    expB = extractFloat64Exp(b);
    signB = extractFloat64Sign(b);
    signZ = signA ^ signB;

    /* Dividend is NaN or infinity. */
    if (expA == 0x7FF) {
        if (sigA) {
            return propagateFloat64NaN(a, b STATUS_VAR);
        }
        if (expB == 0x7FF) {
            if (sigB) {
                return propagateFloat64NaN(a, b STATUS_VAR);
            }
            /* infinity / infinity */
            float_raise(float_flag_invalid STATUS_VAR);
            return float64_default_nan;
        }
        return packFloat64(signZ, 0x7FF, 0);
    }
    /* Divisor is NaN or infinity. */
    if (expB == 0x7FF) {
        if (sigB) {
            return propagateFloat64NaN(a, b STATUS_VAR);
        }
        return packFloat64(signZ, 0, 0);
    }
    /* Divisor is zero or subnormal. */
    if (expB == 0) {
        if (sigB == 0) {
            if ((expA | sigA) == 0) {
                /* zero / zero */
                float_raise(float_flag_invalid STATUS_VAR);
                return float64_default_nan;
            }
            float_raise(float_flag_divbyzero STATUS_VAR);
            return packFloat64(signZ, 0x7FF, 0);
        }
        normalizeFloat64Subnormal(sigB, &expB, &sigB);
    }
    /* Dividend is zero or subnormal. */
    if (expA == 0) {
        if (sigA == 0) {
            return packFloat64(signZ, 0, 0);
        }
        normalizeFloat64Subnormal(sigA, &expA, &sigA);
    }

    expZ = expA - expB + 0x3FD;
    sigA = (sigA | LIT64(0x0010000000000000)) << 10;
    sigB = (sigB | LIT64(0x0010000000000000)) << 11;
    if (sigB <= (sigA + sigA)) {
        sigA >>= 1;
        ++expZ;
    }

    quot = estimateDiv128To64(sigA, 0, sigB);
    /* Only when the low nine bits of the estimate are 0, 1 or 2 is the
     * exact remainder computed to correct the quotient and set the
     * sticky bit. */
    if ((quot & 0x1FF) <= 2) {
        mul64To128(sigB, quot, &prodHigh, &prodLow);
        sub128(sigA, 0, prodHigh, prodLow, &remHigh, &remLow);
        while ((int64_t)remHigh < 0) {
            /* Estimate was too large: step the quotient down and add
             * the divisor back into the remainder. */
            --quot;
            add128(remHigh, remLow, 0, sigB, &remHigh, &remLow);
        }
        quot |= (remLow != 0);
    }
    return roundAndPackFloat64(signZ, expZ, quot STATUS_VAR);
}
3903

    
3904
/*----------------------------------------------------------------------------
3905
| Returns the remainder of the double-precision floating-point value `a'
3906
| with respect to the corresponding value `b'.  The operation is performed
3907
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3908
*----------------------------------------------------------------------------*/
3909

    
3910
float64 float64_rem( float64 a, float64 b STATUS_PARAM )
{
    flag signA, flipSign;
    int_fast16_t expA, expB, delta;
    uint64_t sigA, sigB;
    uint64_t quot, prevSigA;
    int64_t sigSum;

    a = float64_squash_input_denormal(a STATUS_VAR);
    b = float64_squash_input_denormal(b STATUS_VAR);
    sigA = extractFloat64Frac(a);
    expA = extractFloat64Exp(a);
    signA = extractFloat64Sign(a);
    sigB = extractFloat64Frac(b);
    expB = extractFloat64Exp(b);

    /* a is NaN or infinity: propagate a NaN, otherwise invalid. */
    if (expA == 0x7FF) {
        if (sigA || ((expB == 0x7FF) && sigB)) {
            return propagateFloat64NaN(a, b STATUS_VAR);
        }
        float_raise(float_flag_invalid STATUS_VAR);
        return float64_default_nan;
    }
    /* b is NaN (propagate) or infinity (result is a). */
    if (expB == 0x7FF) {
        if (sigB) {
            return propagateFloat64NaN(a, b STATUS_VAR);
        }
        return a;
    }
    /* b is zero (invalid) or subnormal (normalize). */
    if (expB == 0) {
        if (sigB == 0) {
            float_raise(float_flag_invalid STATUS_VAR);
            return float64_default_nan;
        }
        normalizeFloat64Subnormal(sigB, &expB, &sigB);
    }
    /* a is zero (result is a) or subnormal (normalize). */
    if (expA == 0) {
        if (sigA == 0) {
            return a;
        }
        normalizeFloat64Subnormal(sigA, &expA, &sigA);
    }

    delta = expA - expB;
    sigA = (sigA | LIT64(0x0010000000000000)) << 11;
    sigB = (sigB | LIT64(0x0010000000000000)) << 11;
    if (delta < 0) {
        if (delta < -1) {
            /* Exponent of a is at least two below that of b: a is
             * returned as the remainder. */
            return a;
        }
        sigA >>= 1;
    }

    quot = (sigB <= sigA);
    if (quot) {
        sigA -= sigB;
    }

    /* Reduce the remainder in steps of 62 exponent units while more
     * than 64 remain. */
    delta -= 64;
    while (0 < delta) {
        quot = estimateDiv128To64(sigA, 0, sigB);
        quot = (2 < quot) ? quot - 2 : 0;
        sigA = -((sigB >> 2) * quot);
        delta -= 62;
    }
    delta += 64;

    if (0 < delta) {
        /* Final partial step covering the remaining exponent gap. */
        quot = estimateDiv128To64(sigA, 0, sigB);
        quot = (2 < quot) ? quot - 2 : 0;
        quot >>= 64 - delta;
        sigB >>= 2;
        sigA = ((sigA >> 1) << (delta - 1)) - sigB * quot;
    } else {
        sigA >>= 2;
        sigB >>= 2;
    }

    /* Subtract the divisor until the remainder goes negative, keeping
     * the last non-negative value as the alternative candidate. */
    do {
        prevSigA = sigA;
        ++quot;
        sigA -= sigB;
    } while (0 <= (int64_t)sigA);

    /* Choose between the two candidates from the sign of their sum;
     * when the sum is zero, the parity of the quotient decides. */
    sigSum = sigA + prevSigA;
    if ((sigSum < 0) || ((sigSum == 0) && (quot & 1))) {
        sigA = prevSigA;
    }

    flipSign = ((int64_t)sigA < 0);
    if (flipSign) {
        sigA = -sigA;
    }
    return normalizeRoundAndPackFloat64(signA ^ flipSign, expB, sigA STATUS_VAR);
}
3989

    
3990
/*----------------------------------------------------------------------------
3991
| Returns the result of multiplying the double-precision floating-point values
3992
| `a' and `b' then adding 'c', with no intermediate rounding step after the
3993
| multiplication.  The operation is performed according to the IEC/IEEE
3994
| Standard for Binary Floating-Point Arithmetic 754-2008.
3995
| The flags argument allows the caller to select negation of the
3996
| addend, the intermediate product, or the final result. (The difference
3997
| between this and having the caller do a separate negation is that negating
3998
| externally will flip the sign bit on NaNs.)
3999
*----------------------------------------------------------------------------*/
4000

    
4001
float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
{
    flag aSign, bSign, cSign, zSign;
    int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
    uint64_t aSig, bSig, cSig;
    flag pInf, pZero, pSign;
    uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
    int normShift;
    flag signflip, infzero;

    a = float64_squash_input_denormal(a STATUS_VAR);
    b = float64_squash_input_denormal(b STATUS_VAR);
    c = float64_squash_input_denormal(c STATUS_VAR);

    aSig = extractFloat64Frac(a);
    aExp = extractFloat64Exp(a);
    aSign = extractFloat64Sign(a);
    bSig = extractFloat64Frac(b);
    bExp = extractFloat64Exp(b);
    bSign = extractFloat64Sign(b);
    cSig = extractFloat64Frac(c);
    cExp = extractFloat64Exp(c);
    cSign = extractFloat64Sign(c);

    /* True when the product is (0 * inf) or (inf * 0). */
    infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
               (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));

    /*
     * Whether (0,inf,qnan) and (inf,0,qnan) raise InvalidOperation, and
     * which QNaN results, is implementation-defined, so the infzero flag is
     * passed on to the target-specific NaN selection routine.
     */
    if (((aExp == 0x7ff) && aSig) ||
        ((bExp == 0x7ff) && bSig) ||
        ((cExp == 0x7ff) && cSig)) {
        return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
    }

    if (infzero) {
        float_raise(float_flag_invalid STATUS_VAR);
        return float64_default_nan;
    }

    /* Apply the caller-requested negations. */
    if (flags & float_muladd_negate_c) {
        cSign ^= 1;
    }
    signflip = (flags & float_muladd_negate_result) ? 1 : 0;

    /* Sign and class (infinite / zero) of the product a * b. */
    pSign = aSign ^ bSign;
    if (flags & float_muladd_negate_product) {
        pSign ^= 1;
    }
    pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);

    /* Infinite addend. */
    if (cExp == 0x7ff) {
        if (pInf && (pSign ^ cSign)) {
            /* inf + (-inf) => InvalidOperation */
            float_raise(float_flag_invalid STATUS_VAR);
            return float64_default_nan;
        }
        /* Otherwise the result is an infinity with the addend's sign. */
        return packFloat64(cSign ^ signflip, 0x7ff, 0);
    }

    /* Infinite product with a finite addend. */
    if (pInf) {
        return packFloat64(pSign ^ signflip, 0x7ff, 0);
    }

    /* Zero product: the result is determined by the addend alone. */
    if (pZero) {
        if (cExp == 0 && cSig == 0) {
            /* Sum of two exact zeroes: the sign depends on rounding mode. */
            if (pSign == cSign) {
                zSign = pSign;
            } else {
                zSign = (STATUS(float_rounding_mode) == float_round_down);
            }
            return packFloat64(zSign ^ signflip, 0, 0);
        }
        if (cExp == 0 && STATUS(flush_to_zero)) {
            /* Exact zero plus a denormal, with flush-to-zero enabled. */
            float_raise(float_flag_output_denormal STATUS_VAR);
            return packFloat64(cSign ^ signflip, 0, 0);
        }
        /* Zero plus something non-zero: hand back the addend. */
        return packFloat64(cSign ^ signflip, cExp, cSig);
    }

    if (aExp == 0) {
        normalizeFloat64Subnormal(aSig, &aExp, &aSig);
    }
    if (bExp == 0) {
        normalizeFloat64Subnormal(bSig, &bExp, &bSig);
    }

    /*
     * Compute a * b + c proper, starting with the multiplication.
     *
     * 0x3fe is subtracted here (float64_mul() subtracts 0x3ff) because the
     * true exponent is wanted, not the "one-less-than" form that
     * roundAndPackFloat64() takes.
     */
    pExp = aExp + bExp - 0x3fe;
    aSig = (aSig | LIT64(0x0010000000000000)) << 10;
    bSig = (bSig | LIT64(0x0010000000000000)) << 11;
    mul64To128(aSig, bSig, &pSig0, &pSig1);
    if ((int64_t)(pSig0 << 1) >= 0) {
        shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
        pExp--;
    }

    zSign = pSign ^ signflip;

    /*
     * [pSig0:pSig1] now holds the product significand with its explicit
     * bit in position 126.
     */
    if (cExp == 0) {
        if (!cSig) {
            /* Exact-zero addend: round the product on its own. */
            shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
            return roundAndPackFloat64(zSign, pExp - 1,
                                       pSig1 STATUS_VAR);
        }
        normalizeFloat64Subnormal(cSig, &cExp, &cSig);
    }

    /*
     * Build [cSig0:cSig1], the addend significand, with its explicit bit
     * in position 126 as well.
     */
    cSig0 = cSig << (126 - 64 - 52);
    cSig1 = 0;
    cSig0 |= LIT64(0x4000000000000000);
    expDiff = pExp - cExp;

    if (pSign == cSign) {
        /* Same signs: magnitude addition. */
        zExp = (expDiff > 0) ? pExp : cExp;
        if (expDiff > 0) {
            /* Align the addend to the product. */
            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
        } else if (expDiff < 0) {
            /* Align the product to the addend. */
            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
        }

        /* Add, then bring the explicit bit back to position 126. */
        add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
        if ((int64_t)zSig0 < 0) {
            shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
        } else {
            zExp--;
        }
        shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
        return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
    }

    /* Opposite signs: magnitude subtraction, larger minus smaller. */
    if (expDiff > 0) {
        shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
        sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
        zExp = pExp;
    } else if (expDiff < 0) {
        shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
        sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
        zExp = cExp;
        zSign ^= 1;
    } else {
        zExp = pExp;
        if (lt128(cSig0, cSig1, pSig0, pSig1)) {
            sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
        } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
            sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
            zSign ^= 1;
        } else {
            /* Exact cancellation: zero whose sign depends on rounding mode. */
            zSign = signflip;
            if (STATUS(float_rounding_mode) == float_round_down) {
                zSign ^= 1;
            }
            return packFloat64(zSign, 0, 0);
        }
    }
    --zExp;

    /*
     * Equivalent of normalizeRoundAndPackFloat64(), but starting from a
     * significand held in a pair of uint64_t.
     */
    if (zSig0) {
        normShift = countLeadingZeros64(zSig0) - 1;
        shortShift128Left(zSig0, zSig1, normShift, &zSig0, &zSig1);
        if (zSig1) {
            /* Fold the discarded low word into a sticky bit. */
            zSig0 |= 1;
        }
        zExp -= normShift;
    } else {
        normShift = countLeadingZeros64(zSig1);
        if (normShift == 0) {
            zSig0 = (zSig1 >> 1) | (zSig1 & 1);
            zExp -= 63;
        } else {
            normShift--;
            zSig0 = zSig1 << normShift;
            zExp -= (normShift + 64);
        }
    }
    return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
}
4215

    
4216
/*----------------------------------------------------------------------------
4217
| Returns the square root of the double-precision floating-point value `a'.
4218
| The operation is performed according to the IEC/IEEE Standard for Binary
4219
| Floating-Point Arithmetic.
4220
*----------------------------------------------------------------------------*/
4221

    
4222
float64 float64_sqrt(float64 a STATUS_PARAM)
{
    flag aSign;
    int_fast16_t aExp, zExp;
    uint64_t aSig, zSig, doubleZSig;
    uint64_t rem0, rem1, term0, term1;

    a = float64_squash_input_denormal(a STATUS_VAR);

    aSig = extractFloat64Frac(a);
    aExp = extractFloat64Exp(a);
    aSign = extractFloat64Sign(a);

    /* NaN propagates; positive infinity is its own square root. */
    if (aExp == 0x7FF) {
        if (aSig) {
            return propagateFloat64NaN(a, a STATUS_VAR);
        }
        if (!aSign) {
            return a;
        }
    }

    /*
     * Negative operand (including negative infinity, which falls through
     * from above): only a negative zero is returned as-is, everything else
     * is an invalid operation.
     */
    if (aSign) {
        if ((aExp | aSig) == 0) {
            return a;
        }
        float_raise(float_flag_invalid STATUS_VAR);
        return float64_default_nan;
    }

    if (aExp == 0) {
        if (aSig == 0) {
            return float64_zero;
        }
        normalizeFloat64Subnormal(aSig, &aExp, &aSig);
    }

    /* Result exponent: half the unbiased input exponent, re-biased. */
    zExp = ((aExp - 0x3FF) >> 1) + 0x3FE;
    aSig |= LIT64(0x0010000000000000);

    /* Initial 32-bit estimate, then refine it using a 128/64 division. */
    zSig = estimateSqrt32(aExp, aSig >> 21);
    aSig <<= 9 - (aExp & 1);
    zSig = estimateDiv128To64(aSig, 0, zSig << 32) + (zSig << 30);

    /*
     * Only when the low nine bits are at most 5 is the exact remainder
     * computed: zSig is decremented until aSig - zSig^2 is non-negative and
     * a sticky bit is set if that remainder is non-zero.
     */
    if ((zSig & 0x1FF) <= 5) {
        doubleZSig = zSig << 1;
        mul64To128(zSig, zSig, &term0, &term1);
        sub128(aSig, 0, term0, term1, &rem0, &rem1);
        while ((int64_t) rem0 < 0) {
            --zSig;
            doubleZSig -= 2;
            add128(rem0, rem1, zSig >> 63, doubleZSig | 1, &rem0, &rem1);
        }
        zSig |= ((rem0 | rem1) != 0);
    }
    return roundAndPackFloat64(0, zExp, zSig STATUS_VAR);
}
4267

    
4268
/*----------------------------------------------------------------------------
4269
| Returns the binary log of the double-precision floating-point value `a'.
4270
| The operation is performed according to the IEC/IEEE Standard for Binary
4271
| Floating-Point Arithmetic.
4272
*----------------------------------------------------------------------------*/
4273
float64 float64_log2(float64 a STATUS_PARAM)
{
    flag aSign, zSign;
    int_fast16_t aExp;
    uint64_t aSig, aSig0, aSig1, zSig, i;

    a = float64_squash_input_denormal(a STATUS_VAR);

    aSig = extractFloat64Frac(a);
    aExp = extractFloat64Exp(a);
    aSign = extractFloat64Sign(a);

    /* Zero of either sign yields negative infinity. */
    if (aExp == 0) {
        if (aSig == 0) {
            return packFloat64(1, 0x7FF, 0);
        }
        normalizeFloat64Subnormal(aSig, &aExp, &aSig);
    }

    /*
     * Any remaining operand with the sign bit set is an invalid operation.
     * NOTE(review): this test comes before the NaN test below, so a NaN
     * with its sign bit set also takes this path -- confirm that is the
     * intended behaviour.
     */
    if (aSign) {
        float_raise(float_flag_invalid STATUS_VAR);
        return float64_default_nan;
    }

    /* NaN propagates; positive infinity is returned unchanged. */
    if (aExp == 0x7FF) {
        if (aSig) {
            return propagateFloat64NaN(a, float64_zero STATUS_VAR);
        }
        return a;
    }

    /* The unbiased exponent supplies the integer part of the result. */
    aExp -= 0x3FF;
    aSig |= LIT64(0x0010000000000000);
    zSign = aExp < 0;
    zSig = (uint64_t)aExp << 52;

    /*
     * Produce the 52 fraction bits one at a time, most significant first:
     * square the significand and, if bit 53 of the squared value is set,
     * halve it and set the current result bit.
     */
    i = 1LL << 51;
    while (i > 0) {
        mul64To128(aSig, aSig, &aSig0, &aSig1);
        aSig = (aSig0 << 12) | (aSig1 >> 52);
        if (aSig & LIT64(0x0020000000000000)) {
            aSig >>= 1;
            zSig |= i;
        }
        i >>= 1;
    }

    if (zSign) {
        zSig = -zSig;
    }
    return normalizeRoundAndPackFloat64(zSign, 0x408, zSig STATUS_VAR);
}
4314

    
4315
/*----------------------------------------------------------------------------
4316
| Returns 1 if the double-precision floating-point value `a' is equal to the
4317
| corresponding value `b', and 0 otherwise.  The invalid exception is raised
4318
| if either operand is a NaN.  Otherwise, the comparison is performed
4319
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4320
*----------------------------------------------------------------------------*/
4321

    
4322
int float64_eq(float64 a, float64 b STATUS_PARAM)
{
    uint64_t aBits, bBits;

    a = float64_squash_input_denormal(a STATUS_VAR);
    b = float64_squash_input_denormal(b STATUS_VAR);

    /* A NaN on either side: raise invalid and report "not equal". */
    if (((extractFloat64Exp(a) == 0x7FF) && extractFloat64Frac(a)) ||
        ((extractFloat64Exp(b) == 0x7FF) && extractFloat64Frac(b))) {
        float_raise(float_flag_invalid STATUS_VAR);
        return 0;
    }

    aBits = float64_val(a);
    bBits = float64_val(b);
    if (aBits == bBits) {
        return 1;
    }
    /* +0 and -0 are equal: all bits other than the sign are clear. */
    return (uint64_t)((aBits | bBits) << 1) == 0;
}
4339

    
4340
/*----------------------------------------------------------------------------
4341
| Returns 1 if the double-precision floating-point value `a' is less than or
4342
| equal to the corresponding value `b', and 0 otherwise.  The invalid
4343
| exception is raised if either operand is a NaN.  The comparison is performed
4344
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4345
*----------------------------------------------------------------------------*/
4346

    
4347
int float64_le(float64 a, float64 b STATUS_PARAM)
{
    flag aSign, bSign;
    uint64_t aBits, bBits;

    a = float64_squash_input_denormal(a STATUS_VAR);
    b = float64_squash_input_denormal(b STATUS_VAR);

    /* A NaN on either side: raise invalid and report false. */
    if (((extractFloat64Exp(a) == 0x7FF) && extractFloat64Frac(a)) ||
        ((extractFloat64Exp(b) == 0x7FF) && extractFloat64Frac(b))) {
        float_raise(float_flag_invalid STATUS_VAR);
        return 0;
    }

    aSign = extractFloat64Sign(a);
    bSign = extractFloat64Sign(b);
    aBits = float64_val(a);
    bBits = float64_val(b);

    if (aSign != bSign) {
        /* Differing signs: true if `a' is the negative one or both are zero. */
        return aSign || ((uint64_t)((aBits | bBits) << 1) == 0);
    }
    if (aBits == bBits) {
        return 1;
    }
    /* Same sign: the raw-bit ordering is reversed for negative values. */
    return (aSign ^ (aBits < bBits)) != 0;
}
4368

    
4369
/*----------------------------------------------------------------------------
4370
| Returns 1 if the double-precision floating-point value `a' is less than
4371
| the corresponding value `b', and 0 otherwise.  The invalid exception is
4372
| raised if either operand is a NaN.  The comparison is performed according
4373
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4374
*----------------------------------------------------------------------------*/
4375

    
4376
int float64_lt(float64 a, float64 b STATUS_PARAM)
{
    flag aSign, bSign;
    uint64_t aBits, bBits;

    a = float64_squash_input_denormal(a STATUS_VAR);
    b = float64_squash_input_denormal(b STATUS_VAR);

    /* A NaN on either side: raise invalid and report false. */
    if (((extractFloat64Exp(a) == 0x7FF) && extractFloat64Frac(a)) ||
        ((extractFloat64Exp(b) == 0x7FF) && extractFloat64Frac(b))) {
        float_raise(float_flag_invalid STATUS_VAR);
        return 0;
    }

    aSign = extractFloat64Sign(a);
    bSign = extractFloat64Sign(b);
    aBits = float64_val(a);
    bBits = float64_val(b);

    if (aSign != bSign) {
        /* Differing signs: true if `a' is negative and not both are zero. */
        return aSign && ((uint64_t)((aBits | bBits) << 1) != 0);
    }
    if (aBits == bBits) {
        return 0;
    }
    /* Same sign: the raw-bit ordering is reversed for negative values. */
    return (aSign ^ (aBits < bBits)) != 0;
}
4397

    
4398
/*----------------------------------------------------------------------------
4399
| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4400
| be compared, and 0 otherwise.  The invalid exception is raised if either
4401
| operand is a NaN.  The comparison is performed according to the IEC/IEEE
4402
| Standard for Binary Floating-Point Arithmetic.
4403
*----------------------------------------------------------------------------*/
4404

    
4405
int float64_unordered(float64 a, float64 b STATUS_PARAM)
{
    a = float64_squash_input_denormal(a STATUS_VAR);
    b = float64_squash_input_denormal(b STATUS_VAR);

    /* Ordered unless at least one operand is a NaN. */
    if (!(((extractFloat64Exp(a) == 0x7FF) && extractFloat64Frac(a)) ||
          ((extractFloat64Exp(b) == 0x7FF) && extractFloat64Frac(b)))) {
        return 0;
    }

    /* This signaling variant raises invalid for any NaN operand. */
    float_raise(float_flag_invalid STATUS_VAR);
    return 1;
}
4418

    
4419
/*----------------------------------------------------------------------------
4420
| Returns 1 if the double-precision floating-point value `a' is equal to the
4421
| corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4422
| exception.  The comparison is performed according to the IEC/IEEE Standard
4423
| for Binary Floating-Point Arithmetic.
4424
*----------------------------------------------------------------------------*/
4425

    
4426
int float64_eq_quiet(float64 a, float64 b STATUS_PARAM)
{
    uint64_t aBits, bBits;

    a = float64_squash_input_denormal(a STATUS_VAR);
    b = float64_squash_input_denormal(b STATUS_VAR);

    /* A NaN on either side: not equal; only signaling NaNs raise invalid. */
    if (((extractFloat64Exp(a) == 0x7FF) && extractFloat64Frac(a)) ||
        ((extractFloat64Exp(b) == 0x7FF) && extractFloat64Frac(b))) {
        if (float64_is_signaling_nan(a) || float64_is_signaling_nan(b)) {
            float_raise(float_flag_invalid STATUS_VAR);
        }
        return 0;
    }

    aBits = float64_val(a);
    bBits = float64_val(b);
    if (aBits == bBits) {
        return 1;
    }
    /* +0 and -0 are equal: all bits other than the sign are clear. */
    return (uint64_t)((aBits | bBits) << 1) == 0;
}
4445

    
4446
/*----------------------------------------------------------------------------
4447
| Returns 1 if the double-precision floating-point value `a' is less than or
4448
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4449
| cause an exception.  Otherwise, the comparison is performed according to the
4450
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4451
*----------------------------------------------------------------------------*/
4452

    
4453
int float64_le_quiet(float64 a, float64 b STATUS_PARAM)
{
    flag aSign, bSign;
    uint64_t aBits, bBits;

    a = float64_squash_input_denormal(a STATUS_VAR);
    b = float64_squash_input_denormal(b STATUS_VAR);

    /* A NaN on either side: false; only signaling NaNs raise invalid. */
    if (((extractFloat64Exp(a) == 0x7FF) && extractFloat64Frac(a)) ||
        ((extractFloat64Exp(b) == 0x7FF) && extractFloat64Frac(b))) {
        if (float64_is_signaling_nan(a) || float64_is_signaling_nan(b)) {
            float_raise(float_flag_invalid STATUS_VAR);
        }
        return 0;
    }

    aSign = extractFloat64Sign(a);
    bSign = extractFloat64Sign(b);
    aBits = float64_val(a);
    bBits = float64_val(b);

    if (aSign != bSign) {
        /* Differing signs: true if `a' is the negative one or both are zero. */
        return aSign || ((uint64_t)((aBits | bBits) << 1) == 0);
    }
    if (aBits == bBits) {
        return 1;
    }
    /* Same sign: the raw-bit ordering is reversed for negative values. */
    return (aSign ^ (aBits < bBits)) != 0;
}
4476

    
4477
/*----------------------------------------------------------------------------
4478
| Returns 1 if the double-precision floating-point value `a' is less than
4479
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4480
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4481
| Standard for Binary Floating-Point Arithmetic.
4482
*----------------------------------------------------------------------------*/
4483

    
4484
int float64_lt_quiet(float64 a, float64 b STATUS_PARAM)
{
    flag aSign, bSign;
    uint64_t aBits, bBits;

    a = float64_squash_input_denormal(a STATUS_VAR);
    b = float64_squash_input_denormal(b STATUS_VAR);

    /* A NaN on either side: false; only signaling NaNs raise invalid. */
    if (((extractFloat64Exp(a) == 0x7FF) && extractFloat64Frac(a)) ||
        ((extractFloat64Exp(b) == 0x7FF) && extractFloat64Frac(b))) {
        if (float64_is_signaling_nan(a) || float64_is_signaling_nan(b)) {
            float_raise(float_flag_invalid STATUS_VAR);
        }
        return 0;
    }

    aSign = extractFloat64Sign(a);
    bSign = extractFloat64Sign(b);
    aBits = float64_val(a);
    bBits = float64_val(b);

    if (aSign != bSign) {
        /* Differing signs: true if `a' is negative and not both are zero. */
        return aSign && ((uint64_t)((aBits | bBits) << 1) != 0);
    }
    if (aBits == bBits) {
        return 0;
    }
    /* Same sign: the raw-bit ordering is reversed for negative values. */
    return (aSign ^ (aBits < bBits)) != 0;
}
4507

    
4508
/*----------------------------------------------------------------------------
4509
| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4510
| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4511
| comparison is performed according to the IEC/IEEE Standard for Binary
4512
| Floating-Point Arithmetic.
4513
*----------------------------------------------------------------------------*/
4514

    
4515
int float64_unordered_quiet(float64 a, float64 b STATUS_PARAM)
{
    a = float64_squash_input_denormal(a STATUS_VAR);
    b = float64_squash_input_denormal(b STATUS_VAR);

    /* Ordered unless at least one operand is a NaN. */
    if (!(((extractFloat64Exp(a) == 0x7FF) && extractFloat64Frac(a)) ||
          ((extractFloat64Exp(b) == 0x7FF) && extractFloat64Frac(b)))) {
        return 0;
    }

    /* Unordered; invalid is raised only when a signaling NaN is present. */
    if (float64_is_signaling_nan(a) || float64_is_signaling_nan(b)) {
        float_raise(float_flag_invalid STATUS_VAR);
    }
    return 1;
}
4530

    
4531
/*----------------------------------------------------------------------------
4532
| Returns the result of converting the extended double-precision floating-
4533
| point value `a' to the 32-bit two's complement integer format.  The
4534
| conversion is performed according to the IEC/IEEE Standard for Binary
4535
| Floating-Point Arithmetic---which means in particular that the conversion
4536
| is rounded according to the current rounding mode.  If `a' is a NaN, the
4537
| largest positive integer is returned.  Otherwise, if the conversion
4538
| overflows, the largest integer with the same sign as `a' is returned.
4539
*----------------------------------------------------------------------------*/
4540

    
4541
int32 floatx80_to_int32(floatx80 a STATUS_PARAM)
{
    uint64_t aSig = extractFloatx80Frac(a);
    int32 aExp = extractFloatx80Exp(a);
    flag aSign = extractFloatx80Sign(a);
    int32 shiftCount;

    /* A NaN is treated as positive for the conversion. */
    if ((aExp == 0x7FFF) && (uint64_t)(aSig << 1)) {
        aSign = 0;
    }

    /* Never shift by less than one bit, however large the exponent. */
    shiftCount = (aExp < 0x4037) ? 0x4037 - aExp : 1;
    shift64RightJamming(aSig, shiftCount, &aSig);

    /* Rounding is left to roundAndPackInt32(). */
    return roundAndPackInt32(aSign, aSig STATUS_VAR);
}
4557

    
4558
/*----------------------------------------------------------------------------
4559
| Returns the result of converting the extended double-precision floating-
4560
| point value `a' to the 32-bit two's complement integer format.  The
4561
| conversion is performed according to the IEC/IEEE Standard for Binary
4562
| Floating-Point Arithmetic, except that the conversion is always rounded
4563
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
4564
| Otherwise, if the conversion overflows, the largest integer with the same
4565
| sign as `a' is returned.
4566
*----------------------------------------------------------------------------*/
4567

    
4568
int32 floatx80_to_int32_round_to_zero(floatx80 a STATUS_PARAM)
{
    uint64_t aSig = extractFloatx80Frac(a);
    int32 aExp = extractFloatx80Exp(a);
    flag aSign = extractFloatx80Sign(a);
    int32 shiftCount;
    uint64_t savedASig;
    int32_t z;

    /* Exponent too large for a 32-bit result (this includes NaN and inf). */
    if (0x401E < aExp) {
        if ((aExp == 0x7FFF) && (uint64_t)(aSig << 1)) {
            /* A NaN is reported as the largest positive integer. */
            aSign = 0;
        }
        goto invalid;
    }

    /* Magnitude below one: result is 0, inexact unless the input is zero. */
    if (aExp < 0x3FFF) {
        if (aExp || aSig) {
            STATUS(float_exception_flags) |= float_flag_inexact;
        }
        return 0;
    }

    shiftCount = 0x403E - aExp;
    savedASig = aSig;
    aSig >>= shiftCount;
    z = aSig;
    if (aSign) {
        z = -z;
    }

    /* A result whose sign disagrees with the input sign means overflow. */
    if ((z < 0) ^ aSign) {
        goto invalid;
    }

    /* Any bits shifted out make the conversion inexact. */
    if ((aSig << shiftCount) != savedASig) {
        STATUS(float_exception_flags) |= float_flag_inexact;
    }
    return z;

 invalid:
    float_raise(float_flag_invalid STATUS_VAR);
    return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
}
4602

    
4603
/*----------------------------------------------------------------------------
4604
| Returns the result of converting the extended double-precision floating-
4605
| point value `a' to the 64-bit two's complement integer format.  The
4606
| conversion is performed according to the IEC/IEEE Standard for Binary
4607
| Floating-Point Arithmetic---which means in particular that the conversion
4608
| is rounded according to the current rounding mode.  If `a' is a NaN,
4609
| the largest positive integer is returned.  Otherwise, if the conversion
4610
| overflows, the largest integer with the same sign as `a' is returned.
4611
*----------------------------------------------------------------------------*/
4612

    
4613
int64 floatx80_to_int64(floatx80 a STATUS_PARAM)
{
    uint64_t aSig = extractFloatx80Frac(a);
    int32 aExp = extractFloatx80Exp(a);
    flag aSign = extractFloatx80Sign(a);
    int32 shiftCount = 0x403E - aExp;
    uint64_t aSigExtra;

    if (shiftCount < 0) {
        /* Exponent too large for 64 bits (this includes NaN and infinity). */
        float_raise(float_flag_invalid STATUS_VAR);
        if (!aSign ||
            ((aExp == 0x7FFF) && (aSig != LIT64(0x8000000000000000)))) {
            /* Positive overflow, or a NaN of either sign. */
            return LIT64(0x7FFFFFFFFFFFFFFF);
        }
        return (int64_t) LIT64(0x8000000000000000);
    }

    if (shiftCount == 0) {
        /* Significand is already aligned; nothing is shifted out. */
        aSigExtra = 0;
    } else {
        shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
    }
    return roundAndPackInt64(aSign, aSig, aSigExtra STATUS_VAR);
}
4642

    
4643
/*----------------------------------------------------------------------------
4644
| Returns the result of converting the extended double-precision floating-
4645
| point value `a' to the 64-bit two's complement integer format.  The
4646
| conversion is performed according to the IEC/IEEE Standard for Binary
4647
| Floating-Point Arithmetic, except that the conversion is always rounded
4648
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
4649
| Otherwise, if the conversion overflows, the largest integer with the same
4650
| sign as `a' is returned.
4651
*----------------------------------------------------------------------------*/
4652

    
4653
int64 floatx80_to_int64_round_to_zero(floatx80 a STATUS_PARAM)
{
    uint64_t aSig = extractFloatx80Frac(a);
    int32 aExp = extractFloatx80Exp(a);
    flag aSign = extractFloatx80Sign(a);
    int32 shiftCount = aExp - 0x403E;
    int64 z;

    if (0 <= shiftCount) {
        /*
         * Magnitude of at least 2^63.  Only the encoding with high word
         * 0xC03E and no fraction bits below the explicit integer bit
         * converts without raising invalid.
         */
        aSig &= LIT64(0x7FFFFFFFFFFFFFFF);
        if ((a.high != 0xC03E) || aSig) {
            float_raise(float_flag_invalid STATUS_VAR);
            if (!aSign || ((aExp == 0x7FFF) && aSig)) {
                /* Positive overflow, or a NaN of either sign. */
                return LIT64(0x7FFFFFFFFFFFFFFF);
            }
        }
        return (int64_t) LIT64(0x8000000000000000);
    }

    /* Magnitude below one: result is 0, inexact unless the input is zero. */
    if (aExp < 0x3FFF) {
        if (aExp || aSig) {
            STATUS(float_exception_flags) |= float_flag_inexact;
        }
        return 0;
    }

    z = aSig >> (-shiftCount);
    /* Any bits shifted out make the conversion inexact. */
    if ((uint64_t)(aSig << (shiftCount & 63))) {
        STATUS(float_exception_flags) |= float_flag_inexact;
    }
    if (aSign) {
        z = -z;
    }
    return z;
}
4686

    
4687
/*----------------------------------------------------------------------------
4688
| Returns the result of converting the extended double-precision floating-
4689
| point value `a' to the single-precision floating-point format.  The
4690
| conversion is performed according to the IEC/IEEE Standard for Binary
4691
| Floating-Point Arithmetic.
4692
*----------------------------------------------------------------------------*/
4693

    
4694
float32 floatx80_to_float32(floatx80 a STATUS_PARAM)
{
    uint64_t aSig = extractFloatx80Frac(a);
    int32 aExp = extractFloatx80Exp(a);
    flag aSign = extractFloatx80Sign(a);

    if (aExp == 0x7FFF) {
        if ((uint64_t)(aSig << 1)) {
            /* NaN: convert through the common NaN representation. */
            return commonNaNToFloat32(floatx80ToCommonNaN(a STATUS_VAR) STATUS_VAR);
        }
        /* Infinity keeps its sign. */
        return packFloat32(aSign, 0xFF, 0);
    }

    /* Narrow the significand, jamming the discarded bits into a sticky bit. */
    shift64RightJamming(aSig, 33, &aSig);
    /* Re-bias the exponent unless the value is an exact zero. */
    if (aExp || aSig) {
        aExp -= 0x3F81;
    }
    return roundAndPackFloat32(aSign, aExp, aSig STATUS_VAR);
}
4714

    
4715
/*----------------------------------------------------------------------------
4716
| Returns the result of converting the extended double-precision floating-
4717
| point value `a' to the double-precision floating-point format.  The
4718
| conversion is performed according to the IEC/IEEE Standard for Binary
4719
| Floating-Point Arithmetic.
4720
*----------------------------------------------------------------------------*/
4721

    
4722
float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
4723
{
4724
    flag aSign;
4725
    int32 aExp;
4726
    uint64_t aSig, zSig;
4727

    
4728
    aSig = extractFloatx80Frac( a );
4729
    aExp = extractFloatx80Exp( a );
4730
    aSign = extractFloatx80Sign( a );
4731
    if ( aExp == 0x7FFF ) {
4732
        if ( (uint64_t) ( aSig<<1 ) ) {
4733
            return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4734
        }
4735
        return packFloat64( aSign, 0x7FF, 0 );
4736
    }
4737
    shift64RightJamming( aSig, 1, &zSig );
4738
    if ( aExp || aSig ) aExp -= 0x3C01;
4739
    return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4740

    
4741
}
4742

    
4743
/*----------------------------------------------------------------------------
4744
| Returns the result of converting the extended double-precision floating-
4745
| point value `a' to the quadruple-precision floating-point format.  The
4746
| conversion is performed according to the IEC/IEEE Standard for Binary
4747
| Floating-Point Arithmetic.
4748
*----------------------------------------------------------------------------*/
4749

    
4750
float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4751
{
4752
    flag aSign;
4753
    int_fast16_t aExp;
4754
    uint64_t aSig, zSig0, zSig1;
4755

    
4756
    aSig = extractFloatx80Frac( a );
4757
    aExp = extractFloatx80Exp( a );
4758
    aSign = extractFloatx80Sign( a );
4759
    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4760
        return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4761
    }
4762
    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4763
    return packFloat128( aSign, aExp, zSig0, zSig1 );
4764

    
4765
}
4766

    
4767
/*----------------------------------------------------------------------------
4768
| Rounds the extended double-precision floating-point value `a' to an integer,
4769
| and returns the result as an extended quadruple-precision floating-point
4770
| value.  The operation is performed according to the IEC/IEEE Standard for
4771
| Binary Floating-Point Arithmetic.
4772
*----------------------------------------------------------------------------*/
4773

    
4774
floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4775
{
4776
    flag aSign;
4777
    int32 aExp;
4778
    uint64_t lastBitMask, roundBitsMask;
4779
    floatx80 z;
4780

    
4781
    aExp = extractFloatx80Exp( a );
4782
    if ( 0x403E <= aExp ) {
4783
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4784
            return propagateFloatx80NaN( a, a STATUS_VAR );
4785
        }
4786
        return a;
4787
    }
4788
    if ( aExp < 0x3FFF ) {
4789
        if (    ( aExp == 0 )
4790
             && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4791
            return a;
4792
        }
4793
        STATUS(float_exception_flags) |= float_flag_inexact;
4794
        aSign = extractFloatx80Sign( a );
4795
        switch ( STATUS(float_rounding_mode) ) {
4796
         case float_round_nearest_even:
4797
            if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4798
               ) {
4799
                return
4800
                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4801
            }
4802
            break;
4803
        case float_round_ties_away:
4804
            if (aExp == 0x3FFE) {
4805
                return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4806
            }
4807
            break;
4808
         case float_round_down:
4809
            return
4810
                  aSign ?
4811
                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4812
                : packFloatx80( 0, 0, 0 );
4813
         case float_round_up:
4814
            return
4815
                  aSign ? packFloatx80( 1, 0, 0 )
4816
                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4817
        }
4818
        return packFloatx80( aSign, 0, 0 );
4819
    }
4820
    lastBitMask = 1;
4821
    lastBitMask <<= 0x403E - aExp;
4822
    roundBitsMask = lastBitMask - 1;
4823
    z = a;
4824
    switch (STATUS(float_rounding_mode)) {
4825
    case float_round_nearest_even:
4826
        z.low += lastBitMask>>1;
4827
        if ((z.low & roundBitsMask) == 0) {
4828
            z.low &= ~lastBitMask;
4829
        }
4830
        break;
4831
    case float_round_ties_away:
4832
        z.low += lastBitMask >> 1;
4833
        break;
4834
    case float_round_to_zero:
4835
        break;
4836
    case float_round_up:
4837
        if (!extractFloatx80Sign(z)) {
4838
            z.low += roundBitsMask;
4839
        }
4840
        break;
4841
    case float_round_down:
4842
        if (extractFloatx80Sign(z)) {
4843
            z.low += roundBitsMask;
4844
        }
4845
        break;
4846
    default:
4847
        abort();
4848
    }
4849
    z.low &= ~ roundBitsMask;
4850
    if ( z.low == 0 ) {
4851
        ++z.high;
4852
        z.low = LIT64( 0x8000000000000000 );
4853
    }
4854
    if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4855
    return z;
4856

    
4857
}
4858

    
4859
/*----------------------------------------------------------------------------
4860
| Returns the result of adding the absolute values of the extended double-
4861
| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4862
| negated before being returned.  `zSign' is ignored if the result is a NaN.
4863
| The addition is performed according to the IEC/IEEE Standard for Binary
4864
| Floating-Point Arithmetic.
4865
*----------------------------------------------------------------------------*/
4866

    
4867
static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4868
{
4869
    int32 aExp, bExp, zExp;
4870
    uint64_t aSig, bSig, zSig0, zSig1;
4871
    int32 expDiff;
4872

    
4873
    aSig = extractFloatx80Frac( a );
4874
    aExp = extractFloatx80Exp( a );
4875
    bSig = extractFloatx80Frac( b );
4876
    bExp = extractFloatx80Exp( b );
4877
    expDiff = aExp - bExp;
4878
    if ( 0 < expDiff ) {
4879
        if ( aExp == 0x7FFF ) {
4880
            if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4881
            return a;
4882
        }
4883
        if ( bExp == 0 ) --expDiff;
4884
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4885
        zExp = aExp;
4886
    }
4887
    else if ( expDiff < 0 ) {
4888
        if ( bExp == 0x7FFF ) {
4889
            if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4890
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4891
        }
4892
        if ( aExp == 0 ) ++expDiff;
4893
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4894
        zExp = bExp;
4895
    }
4896
    else {
4897
        if ( aExp == 0x7FFF ) {
4898
            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4899
                return propagateFloatx80NaN( a, b STATUS_VAR );
4900
            }
4901
            return a;
4902
        }
4903
        zSig1 = 0;
4904
        zSig0 = aSig + bSig;
4905
        if ( aExp == 0 ) {
4906
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4907
            goto roundAndPack;
4908
        }
4909
        zExp = aExp;
4910
        goto shiftRight1;
4911
    }
4912
    zSig0 = aSig + bSig;
4913
    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4914
 shiftRight1:
4915
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4916
    zSig0 |= LIT64( 0x8000000000000000 );
4917
    ++zExp;
4918
 roundAndPack:
4919
    return
4920
        roundAndPackFloatx80(
4921
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4922

    
4923
}
4924

    
4925
/*----------------------------------------------------------------------------
4926
| Returns the result of subtracting the absolute values of the extended
4927
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4928
| difference is negated before being returned.  `zSign' is ignored if the
4929
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
4930
| Standard for Binary Floating-Point Arithmetic.
4931
*----------------------------------------------------------------------------*/
4932

    
4933
static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4934
{
4935
    int32 aExp, bExp, zExp;
4936
    uint64_t aSig, bSig, zSig0, zSig1;
4937
    int32 expDiff;
4938
    floatx80 z;
4939

    
4940
    aSig = extractFloatx80Frac( a );
4941
    aExp = extractFloatx80Exp( a );
4942
    bSig = extractFloatx80Frac( b );
4943
    bExp = extractFloatx80Exp( b );
4944
    expDiff = aExp - bExp;
4945
    if ( 0 < expDiff ) goto aExpBigger;
4946
    if ( expDiff < 0 ) goto bExpBigger;
4947
    if ( aExp == 0x7FFF ) {
4948
        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4949
            return propagateFloatx80NaN( a, b STATUS_VAR );
4950
        }
4951
        float_raise( float_flag_invalid STATUS_VAR);
4952
        z.low = floatx80_default_nan_low;
4953
        z.high = floatx80_default_nan_high;
4954
        return z;
4955
    }
4956
    if ( aExp == 0 ) {
4957
        aExp = 1;
4958
        bExp = 1;
4959
    }
4960
    zSig1 = 0;
4961
    if ( bSig < aSig ) goto aBigger;
4962
    if ( aSig < bSig ) goto bBigger;
4963
    return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4964
 bExpBigger:
4965
    if ( bExp == 0x7FFF ) {
4966
        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4967
        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4968
    }
4969
    if ( aExp == 0 ) ++expDiff;
4970
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4971
 bBigger:
4972
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4973
    zExp = bExp;
4974
    zSign ^= 1;
4975
    goto normalizeRoundAndPack;
4976
 aExpBigger:
4977
    if ( aExp == 0x7FFF ) {
4978
        if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4979
        return a;
4980
    }
4981
    if ( bExp == 0 ) --expDiff;
4982
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4983
 aBigger:
4984
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4985
    zExp = aExp;
4986
 normalizeRoundAndPack:
4987
    return
4988
        normalizeRoundAndPackFloatx80(
4989
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4990

    
4991
}
4992

    
4993
/*----------------------------------------------------------------------------
4994
| Returns the result of adding the extended double-precision floating-point
4995
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4996
| Standard for Binary Floating-Point Arithmetic.
4997
*----------------------------------------------------------------------------*/
4998

    
4999
floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
5000
{
5001
    flag aSign, bSign;
5002

    
5003
    aSign = extractFloatx80Sign( a );
5004
    bSign = extractFloatx80Sign( b );
5005
    if ( aSign == bSign ) {
5006
        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
5007
    }
5008
    else {
5009
        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
5010
    }
5011

    
5012
}
5013

    
5014
/*----------------------------------------------------------------------------
5015
| Returns the result of subtracting the extended double-precision floating-
5016
| point values `a' and `b'.  The operation is performed according to the
5017
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5018
*----------------------------------------------------------------------------*/
5019

    
5020
floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
5021
{
5022
    flag aSign, bSign;
5023

    
5024
    aSign = extractFloatx80Sign( a );
5025
    bSign = extractFloatx80Sign( b );
5026
    if ( aSign == bSign ) {
5027
        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
5028
    }
5029
    else {
5030
        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
5031
    }
5032

    
5033
}
5034

    
5035
/*----------------------------------------------------------------------------
5036
| Returns the result of multiplying the extended double-precision floating-
5037
| point values `a' and `b'.  The operation is performed according to the
5038
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5039
*----------------------------------------------------------------------------*/
5040

    
5041
floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
5042
{
5043
    flag aSign, bSign, zSign;
5044
    int32 aExp, bExp, zExp;
5045
    uint64_t aSig, bSig, zSig0, zSig1;
5046
    floatx80 z;
5047

    
5048
    aSig = extractFloatx80Frac( a );
5049
    aExp = extractFloatx80Exp( a );
5050
    aSign = extractFloatx80Sign( a );
5051
    bSig = extractFloatx80Frac( b );
5052
    bExp = extractFloatx80Exp( b );
5053
    bSign = extractFloatx80Sign( b );
5054
    zSign = aSign ^ bSign;
5055
    if ( aExp == 0x7FFF ) {
5056
        if (    (uint64_t) ( aSig<<1 )
5057
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5058
            return propagateFloatx80NaN( a, b STATUS_VAR );
5059
        }
5060
        if ( ( bExp | bSig ) == 0 ) goto invalid;
5061
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5062
    }
5063
    if ( bExp == 0x7FFF ) {
5064
        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5065
        if ( ( aExp | aSig ) == 0 ) {
5066
 invalid:
5067
            float_raise( float_flag_invalid STATUS_VAR);
5068
            z.low = floatx80_default_nan_low;
5069
            z.high = floatx80_default_nan_high;
5070
            return z;
5071
        }
5072
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5073
    }
5074
    if ( aExp == 0 ) {
5075
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5076
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5077
    }
5078
    if ( bExp == 0 ) {
5079
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5080
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5081
    }
5082
    zExp = aExp + bExp - 0x3FFE;
5083
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
5084
    if ( 0 < (int64_t) zSig0 ) {
5085
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5086
        --zExp;
5087
    }
5088
    return
5089
        roundAndPackFloatx80(
5090
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5091

    
5092
}
5093

    
5094
/*----------------------------------------------------------------------------
5095
| Returns the result of dividing the extended double-precision floating-point
5096
| value `a' by the corresponding value `b'.  The operation is performed
5097
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5098
*----------------------------------------------------------------------------*/
5099

    
5100
floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
5101
{
5102
    flag aSign, bSign, zSign;
5103
    int32 aExp, bExp, zExp;
5104
    uint64_t aSig, bSig, zSig0, zSig1;
5105
    uint64_t rem0, rem1, rem2, term0, term1, term2;
5106
    floatx80 z;
5107

    
5108
    aSig = extractFloatx80Frac( a );
5109
    aExp = extractFloatx80Exp( a );
5110
    aSign = extractFloatx80Sign( a );
5111
    bSig = extractFloatx80Frac( b );
5112
    bExp = extractFloatx80Exp( b );
5113
    bSign = extractFloatx80Sign( b );
5114
    zSign = aSign ^ bSign;
5115
    if ( aExp == 0x7FFF ) {
5116
        if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5117
        if ( bExp == 0x7FFF ) {
5118
            if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5119
            goto invalid;
5120
        }
5121
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5122
    }
5123
    if ( bExp == 0x7FFF ) {
5124
        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5125
        return packFloatx80( zSign, 0, 0 );
5126
    }
5127
    if ( bExp == 0 ) {
5128
        if ( bSig == 0 ) {
5129
            if ( ( aExp | aSig ) == 0 ) {
5130
 invalid:
5131
                float_raise( float_flag_invalid STATUS_VAR);
5132
                z.low = floatx80_default_nan_low;
5133
                z.high = floatx80_default_nan_high;
5134
                return z;
5135
            }
5136
            float_raise( float_flag_divbyzero STATUS_VAR);
5137
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5138
        }
5139
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5140
    }
5141
    if ( aExp == 0 ) {
5142
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5143
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5144
    }
5145
    zExp = aExp - bExp + 0x3FFE;
5146
    rem1 = 0;
5147
    if ( bSig <= aSig ) {
5148
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
5149
        ++zExp;
5150
    }
5151
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5152
    mul64To128( bSig, zSig0, &term0, &term1 );
5153
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5154
    while ( (int64_t) rem0 < 0 ) {
5155
        --zSig0;
5156
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5157
    }
5158
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
5159
    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5160
        mul64To128( bSig, zSig1, &term1, &term2 );
5161
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5162
        while ( (int64_t) rem1 < 0 ) {
5163
            --zSig1;
5164
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5165
        }
5166
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
5167
    }
5168
    return
5169
        roundAndPackFloatx80(
5170
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5171

    
5172
}
5173

    
5174
/*----------------------------------------------------------------------------
5175
| Returns the remainder of the extended double-precision floating-point value
5176
| `a' with respect to the corresponding value `b'.  The operation is performed
5177
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5178
*----------------------------------------------------------------------------*/
5179

    
5180
floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
5181
{
5182
    flag aSign, zSign;
5183
    int32 aExp, bExp, expDiff;
5184
    uint64_t aSig0, aSig1, bSig;
5185
    uint64_t q, term0, term1, alternateASig0, alternateASig1;
5186
    floatx80 z;
5187

    
5188
    aSig0 = extractFloatx80Frac( a );
5189
    aExp = extractFloatx80Exp( a );
5190
    aSign = extractFloatx80Sign( a );
5191
    bSig = extractFloatx80Frac( b );
5192
    bExp = extractFloatx80Exp( b );
5193
    if ( aExp == 0x7FFF ) {
5194
        if (    (uint64_t) ( aSig0<<1 )
5195
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5196
            return propagateFloatx80NaN( a, b STATUS_VAR );
5197
        }
5198
        goto invalid;
5199
    }
5200
    if ( bExp == 0x7FFF ) {
5201
        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5202
        return a;
5203
    }
5204
    if ( bExp == 0 ) {
5205
        if ( bSig == 0 ) {
5206
 invalid:
5207
            float_raise( float_flag_invalid STATUS_VAR);
5208
            z.low = floatx80_default_nan_low;
5209
            z.high = floatx80_default_nan_high;
5210
            return z;
5211
        }
5212
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5213
    }
5214
    if ( aExp == 0 ) {
5215
        if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5216
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5217
    }
5218
    bSig |= LIT64( 0x8000000000000000 );
5219
    zSign = aSign;
5220
    expDiff = aExp - bExp;
5221
    aSig1 = 0;
5222
    if ( expDiff < 0 ) {
5223
        if ( expDiff < -1 ) return a;
5224
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5225
        expDiff = 0;
5226
    }
5227
    q = ( bSig <= aSig0 );
5228
    if ( q ) aSig0 -= bSig;
5229
    expDiff -= 64;
5230
    while ( 0 < expDiff ) {
5231
        q = estimateDiv128To64( aSig0, aSig1, bSig );
5232
        q = ( 2 < q ) ? q - 2 : 0;
5233
        mul64To128( bSig, q, &term0, &term1 );
5234
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5235
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5236
        expDiff -= 62;
5237
    }
5238
    expDiff += 64;
5239
    if ( 0 < expDiff ) {
5240
        q = estimateDiv128To64( aSig0, aSig1, bSig );
5241
        q = ( 2 < q ) ? q - 2 : 0;
5242
        q >>= 64 - expDiff;
5243
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5244
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5245
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5246
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
5247
            ++q;
5248
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5249
        }
5250
    }
5251
    else {
5252
        term1 = 0;
5253
        term0 = bSig;
5254
    }
5255
    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5256
    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5257
         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5258
              && ( q & 1 ) )
5259
       ) {
5260
        aSig0 = alternateASig0;
5261
        aSig1 = alternateASig1;
5262
        zSign = ! zSign;
5263
    }
5264
    return
5265
        normalizeRoundAndPackFloatx80(
5266
            80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
5267

    
5268
}
5269

    
5270
/*----------------------------------------------------------------------------
5271
| Returns the square root of the extended double-precision floating-point
5272
| value `a'.  The operation is performed according to the IEC/IEEE Standard
5273
| for Binary Floating-Point Arithmetic.
5274
*----------------------------------------------------------------------------*/
5275

    
5276
floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
5277
{
5278
    flag aSign;
5279
    int32 aExp, zExp;
5280
    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5281
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5282
    floatx80 z;
5283

    
5284
    aSig0 = extractFloatx80Frac( a );
5285
    aExp = extractFloatx80Exp( a );
5286
    aSign = extractFloatx80Sign( a );
5287
    if ( aExp == 0x7FFF ) {
5288
        if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
5289
        if ( ! aSign ) return a;
5290
        goto invalid;
5291
    }
5292
    if ( aSign ) {
5293
        if ( ( aExp | aSig0 ) == 0 ) return a;
5294
 invalid:
5295
        float_raise( float_flag_invalid STATUS_VAR);
5296
        z.low = floatx80_default_nan_low;
5297
        z.high = floatx80_default_nan_high;
5298
        return z;
5299
    }
5300
    if ( aExp == 0 ) {
5301
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5302
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5303
    }
5304
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5305
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5306
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5307
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5308
    doubleZSig0 = zSig0<<1;
5309
    mul64To128( zSig0, zSig0, &term0, &term1 );
5310
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5311
    while ( (int64_t) rem0 < 0 ) {
5312
        --zSig0;
5313
        doubleZSig0 -= 2;
5314
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5315
    }
5316
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5317
    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5318
        if ( zSig1 == 0 ) zSig1 = 1;
5319
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5320
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5321
        mul64To128( zSig1, zSig1, &term2, &term3 );
5322
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5323
        while ( (int64_t) rem1 < 0 ) {
5324
            --zSig1;
5325
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5326
            term3 |= 1;
5327
            term2 |= doubleZSig0;
5328
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5329
        }
5330
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5331
    }
5332
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5333
    zSig0 |= doubleZSig0;
5334
    return
5335
        roundAndPackFloatx80(
5336
            STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
5337

    
5338
}
5339

    
5340
/*----------------------------------------------------------------------------
5341
| Returns 1 if the extended double-precision floating-point value `a' is equal
5342
| to the corresponding value `b', and 0 otherwise.  The invalid exception is
5343
| raised if either operand is a NaN.  Otherwise, the comparison is performed
5344
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5345
*----------------------------------------------------------------------------*/
5346

    
5347
int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
5348
{
5349

    
5350
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5351
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5352
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5353
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5354
       ) {
5355
        float_raise( float_flag_invalid STATUS_VAR);
5356
        return 0;
5357
    }
5358
    return
5359
           ( a.low == b.low )
5360
        && (    ( a.high == b.high )
5361
             || (    ( a.low == 0 )
5362
                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5363
           );
5364

    
5365
}
5366

    
5367
/*----------------------------------------------------------------------------
5368
| Returns 1 if the extended double-precision floating-point value `a' is
5369
| less than or equal to the corresponding value `b', and 0 otherwise.  The
5370
| invalid exception is raised if either operand is a NaN.  The comparison is
5371
| performed according to the IEC/IEEE Standard for Binary Floating-Point
5372
| Arithmetic.
5373
*----------------------------------------------------------------------------*/
5374

    
5375
int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
5376
{
5377
    flag aSign, bSign;
5378

    
5379
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5380
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5381
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5382
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5383
       ) {
5384
        float_raise( float_flag_invalid STATUS_VAR);
5385
        return 0;
5386
    }
5387
    aSign = extractFloatx80Sign( a );
5388
    bSign = extractFloatx80Sign( b );
5389
    if ( aSign != bSign ) {
5390
        return
5391
               aSign
5392
            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5393
                 == 0 );
5394
    }
5395
    return
5396
          aSign ? le128( b.high, b.low, a.high, a.low )
5397
        : le128( a.high, a.low, b.high, b.low );
5398

    
5399
}
5400

    
5401
/*----------------------------------------------------------------------------
5402
| Returns 1 if the extended double-precision floating-point value `a' is
5403
| less than the corresponding value `b', and 0 otherwise.  The invalid
5404
| exception is raised if either operand is a NaN.  The comparison is performed
5405
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5406
*----------------------------------------------------------------------------*/
5407

    
5408
int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
5409
{
5410
    flag aSign, bSign;
5411

    
5412
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5413
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5414
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5415
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5416
       ) {
5417
        float_raise( float_flag_invalid STATUS_VAR);
5418
        return 0;
5419
    }
5420
    aSign = extractFloatx80Sign( a );
5421
    bSign = extractFloatx80Sign( b );
5422
    if ( aSign != bSign ) {
5423
        return
5424
               aSign
5425
            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5426
                 != 0 );
5427
    }
5428
    return
5429
          aSign ? lt128( b.high, b.low, a.high, a.low )
5430
        : lt128( a.high, a.low, b.high, b.low );
5431

    
5432
}
5433

    
5434
/*----------------------------------------------------------------------------
5435
| Returns 1 if the extended double-precision floating-point values `a' and `b'
5436
| cannot be compared, and 0 otherwise.  The invalid exception is raised if
5437
| either operand is a NaN.   The comparison is performed according to the
5438
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5439
*----------------------------------------------------------------------------*/
5440
int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
5441
{
5442
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5443
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5444
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5445
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5446
       ) {
5447
        float_raise( float_flag_invalid STATUS_VAR);
5448
        return 1;
5449
    }
5450
    return 0;
5451
}
5452

    
5453
/*----------------------------------------------------------------------------
5454
| Returns 1 if the extended double-precision floating-point value `a' is
5455
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5456
| cause an exception.  The comparison is performed according to the IEC/IEEE
5457
| Standard for Binary Floating-Point Arithmetic.
5458
*----------------------------------------------------------------------------*/
5459

    
5460
int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5461
{
5462

    
5463
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5464
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5465
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5466
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5467
       ) {
5468
        if (    floatx80_is_signaling_nan( a )
5469
             || floatx80_is_signaling_nan( b ) ) {
5470
            float_raise( float_flag_invalid STATUS_VAR);
5471
        }
5472
        return 0;
5473
    }
5474
    return
5475
           ( a.low == b.low )
5476
        && (    ( a.high == b.high )
5477
             || (    ( a.low == 0 )
5478
                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5479
           );
5480

    
5481
}
5482

    
5483
/*----------------------------------------------------------------------------
5484
| Returns 1 if the extended double-precision floating-point value `a' is less
5485
| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5486
| do not cause an exception.  Otherwise, the comparison is performed according
5487
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5488
*----------------------------------------------------------------------------*/
5489

    
5490
int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5491
{
5492
    flag aSign, bSign;
5493

    
5494
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5495
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5496
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5497
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5498
       ) {
5499
        if (    floatx80_is_signaling_nan( a )
5500
             || floatx80_is_signaling_nan( b ) ) {
5501
            float_raise( float_flag_invalid STATUS_VAR);
5502
        }
5503
        return 0;
5504
    }
5505
    aSign = extractFloatx80Sign( a );
5506
    bSign = extractFloatx80Sign( b );
5507
    if ( aSign != bSign ) {
5508
        return
5509
               aSign
5510
            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5511
                 == 0 );
5512
    }
5513
    return
5514
          aSign ? le128( b.high, b.low, a.high, a.low )
5515
        : le128( a.high, a.low, b.high, b.low );
5516

    
5517
}
5518

    
5519
/*----------------------------------------------------------------------------
5520
| Returns 1 if the extended double-precision floating-point value `a' is less
5521
| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5522
| an exception.  Otherwise, the comparison is performed according to the
5523
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5524
*----------------------------------------------------------------------------*/
5525

    
5526
int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5527
{
5528
    flag aSign, bSign;
5529

    
5530
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5531
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5532
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5533
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5534
       ) {
5535
        if (    floatx80_is_signaling_nan( a )
5536
             || floatx80_is_signaling_nan( b ) ) {
5537
            float_raise( float_flag_invalid STATUS_VAR);
5538
        }
5539
        return 0;
5540
    }
5541
    aSign = extractFloatx80Sign( a );
5542
    bSign = extractFloatx80Sign( b );
5543
    if ( aSign != bSign ) {
5544
        return
5545
               aSign
5546
            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5547
                 != 0 );
5548
    }
5549
    return
5550
          aSign ? lt128( b.high, b.low, a.high, a.low )
5551
        : lt128( a.high, a.low, b.high, b.low );
5552

    
5553
}
5554

    
5555
/*----------------------------------------------------------------------------
5556
| Returns 1 if the extended double-precision floating-point values `a' and `b'
5557
| cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5558
| The comparison is performed according to the IEC/IEEE Standard for Binary
5559
| Floating-Point Arithmetic.
5560
*----------------------------------------------------------------------------*/
5561
int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5562
{
5563
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5564
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5565
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5566
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5567
       ) {
5568
        if (    floatx80_is_signaling_nan( a )
5569
             || floatx80_is_signaling_nan( b ) ) {
5570
            float_raise( float_flag_invalid STATUS_VAR);
5571
        }
5572
        return 1;
5573
    }
5574
    return 0;
5575
}
5576

    
5577
/*----------------------------------------------------------------------------
5578
| Returns the result of converting the quadruple-precision floating-point
5579
| value `a' to the 32-bit two's complement integer format.  The conversion
5580
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5581
| Arithmetic---which means in particular that the conversion is rounded
5582
| according to the current rounding mode.  If `a' is a NaN, the largest
5583
| positive integer is returned.  Otherwise, if the conversion overflows, the
5584
| largest integer with the same sign as `a' is returned.
5585
*----------------------------------------------------------------------------*/
5586

    
5587
int32 float128_to_int32( float128 a STATUS_PARAM )
5588
{
5589
    flag aSign;
5590
    int32 aExp, shiftCount;
5591
    uint64_t aSig0, aSig1;
5592

    
5593
    aSig1 = extractFloat128Frac1( a );
5594
    aSig0 = extractFloat128Frac0( a );
5595
    aExp = extractFloat128Exp( a );
5596
    aSign = extractFloat128Sign( a );
5597
    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5598
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5599
    aSig0 |= ( aSig1 != 0 );
5600
    shiftCount = 0x4028 - aExp;
5601
    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5602
    return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
5603

    
5604
}
5605

    
5606
/*----------------------------------------------------------------------------
5607
| Returns the result of converting the quadruple-precision floating-point
5608
| value `a' to the 32-bit two's complement integer format.  The conversion
5609
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5610
| Arithmetic, except that the conversion is always rounded toward zero.  If
5611
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5612
| conversion overflows, the largest integer with the same sign as `a' is
5613
| returned.
5614
*----------------------------------------------------------------------------*/
5615

    
5616
int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
5617
{
5618
    flag aSign;
5619
    int32 aExp, shiftCount;
5620
    uint64_t aSig0, aSig1, savedASig;
5621
    int32_t z;
5622

    
5623
    aSig1 = extractFloat128Frac1( a );
5624
    aSig0 = extractFloat128Frac0( a );
5625
    aExp = extractFloat128Exp( a );
5626
    aSign = extractFloat128Sign( a );
5627
    aSig0 |= ( aSig1 != 0 );
5628
    if ( 0x401E < aExp ) {
5629
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5630
        goto invalid;
5631
    }
5632
    else if ( aExp < 0x3FFF ) {
5633
        if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
5634
        return 0;
5635
    }
5636
    aSig0 |= LIT64( 0x0001000000000000 );
5637
    shiftCount = 0x402F - aExp;
5638
    savedASig = aSig0;
5639
    aSig0 >>= shiftCount;
5640
    z = aSig0;
5641
    if ( aSign ) z = - z;
5642
    if ( ( z < 0 ) ^ aSign ) {
5643
 invalid:
5644
        float_raise( float_flag_invalid STATUS_VAR);
5645
        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5646
    }
5647
    if ( ( aSig0<<shiftCount ) != savedASig ) {
5648
        STATUS(float_exception_flags) |= float_flag_inexact;
5649
    }
5650
    return z;
5651

    
5652
}
5653

    
5654
/*----------------------------------------------------------------------------
5655
| Returns the result of converting the quadruple-precision floating-point
5656
| value `a' to the 64-bit two's complement integer format.  The conversion
5657
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5658
| Arithmetic---which means in particular that the conversion is rounded
5659
| according to the current rounding mode.  If `a' is a NaN, the largest
5660
| positive integer is returned.  Otherwise, if the conversion overflows, the
5661
| largest integer with the same sign as `a' is returned.
5662
*----------------------------------------------------------------------------*/
5663

    
5664
int64 float128_to_int64( float128 a STATUS_PARAM )
5665
{
5666
    flag aSign;
5667
    int32 aExp, shiftCount;
5668
    uint64_t aSig0, aSig1;
5669

    
5670
    aSig1 = extractFloat128Frac1( a );
5671
    aSig0 = extractFloat128Frac0( a );
5672
    aExp = extractFloat128Exp( a );
5673
    aSign = extractFloat128Sign( a );
5674
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5675
    shiftCount = 0x402F - aExp;
5676
    if ( shiftCount <= 0 ) {
5677
        if ( 0x403E < aExp ) {
5678
            float_raise( float_flag_invalid STATUS_VAR);
5679
            if (    ! aSign
5680
                 || (    ( aExp == 0x7FFF )
5681
                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5682
                    )
5683
               ) {
5684
                return LIT64( 0x7FFFFFFFFFFFFFFF );
5685
            }
5686
            return (int64_t) LIT64( 0x8000000000000000 );
5687
        }
5688
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5689
    }
5690
    else {
5691
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5692
    }
5693
    return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
5694

    
5695
}
5696

    
5697
/*----------------------------------------------------------------------------
5698
| Returns the result of converting the quadruple-precision floating-point
5699
| value `a' to the 64-bit two's complement integer format.  The conversion
5700
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5701
| Arithmetic, except that the conversion is always rounded toward zero.
5702
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5703
| the conversion overflows, the largest integer with the same sign as `a' is
5704
| returned.
5705
*----------------------------------------------------------------------------*/
5706

    
5707
int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
5708
{
5709
    flag aSign;
5710
    int32 aExp, shiftCount;
5711
    uint64_t aSig0, aSig1;
5712
    int64 z;
5713

    
5714
    aSig1 = extractFloat128Frac1( a );
5715
    aSig0 = extractFloat128Frac0( a );
5716
    aExp = extractFloat128Exp( a );
5717
    aSign = extractFloat128Sign( a );
5718
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5719
    shiftCount = aExp - 0x402F;
5720
    if ( 0 < shiftCount ) {
5721
        if ( 0x403E <= aExp ) {
5722
            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5723
            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5724
                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5725
                if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
5726
            }
5727
            else {
5728
                float_raise( float_flag_invalid STATUS_VAR);
5729
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5730
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
5731
                }
5732
            }
5733
            return (int64_t) LIT64( 0x8000000000000000 );
5734
        }
5735
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5736
        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5737
            STATUS(float_exception_flags) |= float_flag_inexact;
5738
        }
5739
    }
5740
    else {
5741
        if ( aExp < 0x3FFF ) {
5742
            if ( aExp | aSig0 | aSig1 ) {
5743
                STATUS(float_exception_flags) |= float_flag_inexact;
5744
            }
5745
            return 0;
5746
        }
5747
        z = aSig0>>( - shiftCount );
5748
        if (    aSig1
5749
             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5750
            STATUS(float_exception_flags) |= float_flag_inexact;
5751
        }
5752
    }
5753
    if ( aSign ) z = - z;
5754
    return z;
5755

    
5756
}
5757

    
5758
/*----------------------------------------------------------------------------
5759
| Returns the result of converting the quadruple-precision floating-point
5760
| value `a' to the single-precision floating-point format.  The conversion
5761
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5762
| Arithmetic.
5763
*----------------------------------------------------------------------------*/
5764

    
5765
float32 float128_to_float32( float128 a STATUS_PARAM )
5766
{
5767
    flag aSign;
5768
    int32 aExp;
5769
    uint64_t aSig0, aSig1;
5770
    uint32_t zSig;
5771

    
5772
    aSig1 = extractFloat128Frac1( a );
5773
    aSig0 = extractFloat128Frac0( a );
5774
    aExp = extractFloat128Exp( a );
5775
    aSign = extractFloat128Sign( a );
5776
    if ( aExp == 0x7FFF ) {
5777
        if ( aSig0 | aSig1 ) {
5778
            return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5779
        }
5780
        return packFloat32( aSign, 0xFF, 0 );
5781
    }
5782
    aSig0 |= ( aSig1 != 0 );
5783
    shift64RightJamming( aSig0, 18, &aSig0 );
5784
    zSig = aSig0;
5785
    if ( aExp || zSig ) {
5786
        zSig |= 0x40000000;
5787
        aExp -= 0x3F81;
5788
    }
5789
    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5790

    
5791
}
5792

    
5793
/*----------------------------------------------------------------------------
5794
| Returns the result of converting the quadruple-precision floating-point
5795
| value `a' to the double-precision floating-point format.  The conversion
5796
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5797
| Arithmetic.
5798
*----------------------------------------------------------------------------*/
5799

    
5800
float64 float128_to_float64( float128 a STATUS_PARAM )
5801
{
5802
    flag aSign;
5803
    int32 aExp;
5804
    uint64_t aSig0, aSig1;
5805

    
5806
    aSig1 = extractFloat128Frac1( a );
5807
    aSig0 = extractFloat128Frac0( a );
5808
    aExp = extractFloat128Exp( a );
5809
    aSign = extractFloat128Sign( a );
5810
    if ( aExp == 0x7FFF ) {
5811
        if ( aSig0 | aSig1 ) {
5812
            return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5813
        }
5814
        return packFloat64( aSign, 0x7FF, 0 );
5815
    }
5816
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5817
    aSig0 |= ( aSig1 != 0 );
5818
    if ( aExp || aSig0 ) {
5819
        aSig0 |= LIT64( 0x4000000000000000 );
5820
        aExp -= 0x3C01;
5821
    }
5822
    return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5823

    
5824
}
5825

    
5826
/*----------------------------------------------------------------------------
5827
| Returns the result of converting the quadruple-precision floating-point
5828
| value `a' to the extended double-precision floating-point format.  The
5829
| conversion is performed according to the IEC/IEEE Standard for Binary
5830
| Floating-Point Arithmetic.
5831
*----------------------------------------------------------------------------*/
5832

    
5833
floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5834
{
5835
    flag aSign;
5836
    int32 aExp;
5837
    uint64_t aSig0, aSig1;
5838

    
5839
    aSig1 = extractFloat128Frac1( a );
5840
    aSig0 = extractFloat128Frac0( a );
5841
    aExp = extractFloat128Exp( a );
5842
    aSign = extractFloat128Sign( a );
5843
    if ( aExp == 0x7FFF ) {
5844
        if ( aSig0 | aSig1 ) {
5845
            return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5846
        }
5847
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5848
    }
5849
    if ( aExp == 0 ) {
5850
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5851
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5852
    }
5853
    else {
5854
        aSig0 |= LIT64( 0x0001000000000000 );
5855
    }
5856
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5857
    return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5858

    
5859
}
5860

    
5861
/*----------------------------------------------------------------------------
5862
| Rounds the quadruple-precision floating-point value `a' to an integer, and
5863
| returns the result as a quadruple-precision floating-point value.  The
5864
| operation is performed according to the IEC/IEEE Standard for Binary
5865
| Floating-Point Arithmetic.
5866
*----------------------------------------------------------------------------*/
5867

    
5868
float128 float128_round_to_int( float128 a STATUS_PARAM )
5869
{
5870
    flag aSign;
5871
    int32 aExp;
5872
    uint64_t lastBitMask, roundBitsMask;
5873
    float128 z;
5874

    
5875
    aExp = extractFloat128Exp( a );
5876
    if ( 0x402F <= aExp ) {
5877
        if ( 0x406F <= aExp ) {
5878
            if (    ( aExp == 0x7FFF )
5879
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5880
               ) {
5881
                return propagateFloat128NaN( a, a STATUS_VAR );
5882
            }
5883
            return a;
5884
        }
5885
        lastBitMask = 1;
5886
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5887
        roundBitsMask = lastBitMask - 1;
5888
        z = a;
5889
        switch (STATUS(float_rounding_mode)) {
5890
        case float_round_nearest_even:
5891
            if ( lastBitMask ) {
5892
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5893
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5894
            }
5895
            else {
5896
                if ( (int64_t) z.low < 0 ) {
5897
                    ++z.high;
5898
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5899
                }
5900
            }
5901
            break;
5902
        case float_round_ties_away:
5903
            if (lastBitMask) {
5904
                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
5905
            } else {
5906
                if ((int64_t) z.low < 0) {
5907
                    ++z.high;
5908
                }
5909
            }
5910
            break;
5911
        case float_round_to_zero:
5912
            break;
5913
        case float_round_up:
5914
            if (!extractFloat128Sign(z)) {
5915
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
5916
            }
5917
            break;
5918
        case float_round_down:
5919
            if (extractFloat128Sign(z)) {
5920
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
5921
            }
5922
            break;
5923
        default:
5924
            abort();
5925
        }
5926
        z.low &= ~ roundBitsMask;
5927
    }
5928
    else {
5929
        if ( aExp < 0x3FFF ) {
5930
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5931
            STATUS(float_exception_flags) |= float_flag_inexact;
5932
            aSign = extractFloat128Sign( a );
5933
            switch ( STATUS(float_rounding_mode) ) {
5934
             case float_round_nearest_even:
5935
                if (    ( aExp == 0x3FFE )
5936
                     && (   extractFloat128Frac0( a )
5937
                          | extractFloat128Frac1( a ) )
5938
                   ) {
5939
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
5940
                }
5941
                break;
5942
            case float_round_ties_away:
5943
                if (aExp == 0x3FFE) {
5944
                    return packFloat128(aSign, 0x3FFF, 0, 0);
5945
                }
5946
                break;
5947
             case float_round_down:
5948
                return
5949
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5950
                    : packFloat128( 0, 0, 0, 0 );
5951
             case float_round_up:
5952
                return
5953
                      aSign ? packFloat128( 1, 0, 0, 0 )
5954
                    : packFloat128( 0, 0x3FFF, 0, 0 );
5955
            }
5956
            return packFloat128( aSign, 0, 0, 0 );
5957
        }
5958
        lastBitMask = 1;
5959
        lastBitMask <<= 0x402F - aExp;
5960
        roundBitsMask = lastBitMask - 1;
5961
        z.low = 0;
5962
        z.high = a.high;
5963
        switch (STATUS(float_rounding_mode)) {
5964
        case float_round_nearest_even:
5965
            z.high += lastBitMask>>1;
5966
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5967
                z.high &= ~ lastBitMask;
5968
            }
5969
            break;
5970
        case float_round_ties_away:
5971
            z.high += lastBitMask>>1;
5972
            break;
5973
        case float_round_to_zero:
5974
            break;
5975
        case float_round_up:
5976
            if (!extractFloat128Sign(z)) {
5977
                z.high |= ( a.low != 0 );
5978
                z.high += roundBitsMask;
5979
            }
5980
            break;
5981
        case float_round_down:
5982
            if (extractFloat128Sign(z)) {
5983
                z.high |= (a.low != 0);
5984
                z.high += roundBitsMask;
5985
            }
5986
            break;
5987
        default:
5988
            abort();
5989
        }
5990
        z.high &= ~ roundBitsMask;
5991
    }
5992
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5993
        STATUS(float_exception_flags) |= float_flag_inexact;
5994
    }
5995
    return z;
5996

    
5997
}
5998

    
5999
/*----------------------------------------------------------------------------
6000
| Returns the result of adding the absolute values of the quadruple-precision
6001
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6002
| before being returned.  `zSign' is ignored if the result is a NaN.
6003
| The addition is performed according to the IEC/IEEE Standard for Binary
6004
| Floating-Point Arithmetic.
6005
*----------------------------------------------------------------------------*/
6006

    
6007
static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
6008
{
6009
    int32 aExp, bExp, zExp;
6010
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6011
    int32 expDiff;
6012

    
6013
    aSig1 = extractFloat128Frac1( a );
6014
    aSig0 = extractFloat128Frac0( a );
6015
    aExp = extractFloat128Exp( a );
6016
    bSig1 = extractFloat128Frac1( b );
6017
    bSig0 = extractFloat128Frac0( b );
6018
    bExp = extractFloat128Exp( b );
6019
    expDiff = aExp - bExp;
6020
    if ( 0 < expDiff ) {
6021
        if ( aExp == 0x7FFF ) {
6022
            if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6023
            return a;
6024
        }
6025
        if ( bExp == 0 ) {
6026
            --expDiff;
6027
        }
6028
        else {
6029
            bSig0 |= LIT64( 0x0001000000000000 );
6030
        }
6031
        shift128ExtraRightJamming(
6032
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6033
        zExp = aExp;
6034
    }
6035
    else if ( expDiff < 0 ) {
6036
        if ( bExp == 0x7FFF ) {
6037
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6038
            return packFloat128( zSign, 0x7FFF, 0, 0 );
6039
        }
6040
        if ( aExp == 0 ) {
6041
            ++expDiff;
6042
        }
6043
        else {
6044
            aSig0 |= LIT64( 0x0001000000000000 );
6045
        }
6046
        shift128ExtraRightJamming(
6047
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6048
        zExp = bExp;
6049
    }
6050
    else {
6051
        if ( aExp == 0x7FFF ) {
6052
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6053
                return propagateFloat128NaN( a, b STATUS_VAR );
6054
            }
6055
            return a;
6056
        }
6057
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6058
        if ( aExp == 0 ) {
6059
            if (STATUS(flush_to_zero)) {
6060
                if (zSig0 | zSig1) {
6061
                    float_raise(float_flag_output_denormal STATUS_VAR);
6062
                }
6063
                return packFloat128(zSign, 0, 0, 0);
6064
            }
6065
            return packFloat128( zSign, 0, zSig0, zSig1 );
6066
        }
6067
        zSig2 = 0;
6068
        zSig0 |= LIT64( 0x0002000000000000 );
6069
        zExp = aExp;
6070
        goto shiftRight1;
6071
    }
6072
    aSig0 |= LIT64( 0x0001000000000000 );
6073
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6074
    --zExp;
6075
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6076
    ++zExp;
6077
 shiftRight1:
6078
    shift128ExtraRightJamming(
6079
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6080
 roundAndPack:
6081
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6082

    
6083
}
6084

    
6085
/*----------------------------------------------------------------------------
6086
| Returns the result of subtracting the absolute values of the quadruple-
6087
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
6088
| difference is negated before being returned.  `zSign' is ignored if the
6089
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
6090
| Standard for Binary Floating-Point Arithmetic.
6091
*----------------------------------------------------------------------------*/
6092

    
6093
static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
6094
{
6095
    int32 aExp, bExp, zExp;
6096
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6097
    int32 expDiff;
6098
    float128 z;
6099

    
6100
    aSig1 = extractFloat128Frac1( a );
6101
    aSig0 = extractFloat128Frac0( a );
6102
    aExp = extractFloat128Exp( a );
6103
    bSig1 = extractFloat128Frac1( b );
6104
    bSig0 = extractFloat128Frac0( b );
6105
    bExp = extractFloat128Exp( b );
6106
    expDiff = aExp - bExp;
6107
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6108
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6109
    if ( 0 < expDiff ) goto aExpBigger;
6110
    if ( expDiff < 0 ) goto bExpBigger;
6111
    if ( aExp == 0x7FFF ) {
6112
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6113
            return propagateFloat128NaN( a, b STATUS_VAR );
6114
        }
6115
        float_raise( float_flag_invalid STATUS_VAR);
6116
        z.low = float128_default_nan_low;
6117
        z.high = float128_default_nan_high;
6118
        return z;
6119
    }
6120
    if ( aExp == 0 ) {
6121
        aExp = 1;
6122
        bExp = 1;
6123
    }
6124
    if ( bSig0 < aSig0 ) goto aBigger;
6125
    if ( aSig0 < bSig0 ) goto bBigger;
6126
    if ( bSig1 < aSig1 ) goto aBigger;
6127
    if ( aSig1 < bSig1 ) goto bBigger;
6128
    return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
6129
 bExpBigger:
6130
    if ( bExp == 0x7FFF ) {
6131
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6132
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6133
    }
6134
    if ( aExp == 0 ) {
6135
        ++expDiff;
6136
    }
6137
    else {
6138
        aSig0 |= LIT64( 0x4000000000000000 );
6139
    }
6140
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6141
    bSig0 |= LIT64( 0x4000000000000000 );
6142
 bBigger:
6143
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6144
    zExp = bExp;
6145
    zSign ^= 1;
6146
    goto normalizeRoundAndPack;
6147
 aExpBigger:
6148
    if ( aExp == 0x7FFF ) {
6149
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6150
        return a;
6151
    }
6152
    if ( bExp == 0 ) {
6153
        --expDiff;
6154
    }
6155
    else {
6156
        bSig0 |= LIT64( 0x4000000000000000 );
6157
    }
6158
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6159
    aSig0 |= LIT64( 0x4000000000000000 );
6160
 aBigger:
6161
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6162
    zExp = aExp;
6163
 normalizeRoundAndPack:
6164
    --zExp;
6165
    return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
6166

    
6167
}
6168

    
6169
/*----------------------------------------------------------------------------
6170
| Returns the result of adding the quadruple-precision floating-point values
6171
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6172
| for Binary Floating-Point Arithmetic.
6173
*----------------------------------------------------------------------------*/
6174

    
6175
float128 float128_add( float128 a, float128 b STATUS_PARAM )
6176
{
6177
    flag aSign, bSign;
6178

    
6179
    aSign = extractFloat128Sign( a );
6180
    bSign = extractFloat128Sign( b );
6181
    if ( aSign == bSign ) {
6182
        return addFloat128Sigs( a, b, aSign STATUS_VAR );
6183
    }
6184
    else {
6185
        return subFloat128Sigs( a, b, aSign STATUS_VAR );
6186
    }
6187

    
6188
}
6189

    
6190
/*----------------------------------------------------------------------------
6191
| Returns the result of subtracting the quadruple-precision floating-point
6192
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
6193
| Standard for Binary Floating-Point Arithmetic.
6194
*----------------------------------------------------------------------------*/
6195

    
6196
float128 float128_sub( float128 a, float128 b STATUS_PARAM )
6197
{
6198
    flag aSign, bSign;
6199

    
6200
    aSign = extractFloat128Sign( a );
6201
    bSign = extractFloat128Sign( b );
6202
    if ( aSign == bSign ) {
6203
        return subFloat128Sigs( a, b, aSign STATUS_VAR );
6204
    }
6205
    else {
6206
        return addFloat128Sigs( a, b, aSign STATUS_VAR );
6207
    }
6208

    
6209
}
6210

    
6211
/*----------------------------------------------------------------------------
6212
| Returns the result of multiplying the quadruple-precision floating-point
6213
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
6214
| Standard for Binary Floating-Point Arithmetic.
6215
*----------------------------------------------------------------------------*/
6216

    
6217
float128 float128_mul( float128 a, float128 b STATUS_PARAM )
6218
{
6219
    flag aSign, bSign, zSign;
6220
    int32 aExp, bExp, zExp;
6221
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6222
    float128 z;
6223

    
6224
    aSig1 = extractFloat128Frac1( a );
6225
    aSig0 = extractFloat128Frac0( a );
6226
    aExp = extractFloat128Exp( a );
6227
    aSign = extractFloat128Sign( a );
6228
    bSig1 = extractFloat128Frac1( b );
6229
    bSig0 = extractFloat128Frac0( b );
6230
    bExp = extractFloat128Exp( b );
6231
    bSign = extractFloat128Sign( b );
6232
    zSign = aSign ^ bSign;
6233
    if ( aExp == 0x7FFF ) {
6234
        if (    ( aSig0 | aSig1 )
6235
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6236
            return propagateFloat128NaN( a, b STATUS_VAR );
6237
        }
6238
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6239
        return packFloat128( zSign, 0x7FFF, 0, 0 );
6240
    }
6241
    if ( bExp == 0x7FFF ) {
6242
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6243
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6244
 invalid:
6245
            float_raise( float_flag_invalid STATUS_VAR);
6246
            z.low = float128_default_nan_low;
6247
            z.high = float128_default_nan_high;
6248
            return z;
6249
        }
6250
        return packFloat128( zSign, 0x7FFF, 0, 0 );
6251
    }
6252
    if ( aExp == 0 ) {
6253
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6254
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6255
    }
6256
    if ( bExp == 0 ) {
6257
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6258
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6259
    }
6260
    zExp = aExp + bExp - 0x4000;
6261
    aSig0 |= LIT64( 0x0001000000000000 );
6262
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6263
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6264
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6265
    zSig2 |= ( zSig3 != 0 );
6266
    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6267
        shift128ExtraRightJamming(
6268
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6269
        ++zExp;
6270
    }
6271
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6272

    
6273
}
6274

    
6275
/*----------------------------------------------------------------------------
6276
| Returns the result of dividing the quadruple-precision floating-point value
6277
| `a' by the corresponding value `b'.  The operation is performed according to
6278
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6279
*----------------------------------------------------------------------------*/
6280

    
6281
float128 float128_div( float128 a, float128 b STATUS_PARAM )
6282
{
6283
    flag aSign, bSign, zSign;
6284
    int32 aExp, bExp, zExp;
6285
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6286
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6287
    float128 z;
6288

    
6289
    aSig1 = extractFloat128Frac1( a );
6290
    aSig0 = extractFloat128Frac0( a );
6291
    aExp = extractFloat128Exp( a );
6292
    aSign = extractFloat128Sign( a );
6293
    bSig1 = extractFloat128Frac1( b );
6294
    bSig0 = extractFloat128Frac0( b );
6295
    bExp = extractFloat128Exp( b );
6296
    bSign = extractFloat128Sign( b );
6297
    zSign = aSign ^ bSign;
6298
    if ( aExp == 0x7FFF ) {
6299
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6300
        if ( bExp == 0x7FFF ) {
6301
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6302
            goto invalid;
6303
        }
6304
        return packFloat128( zSign, 0x7FFF, 0, 0 );
6305
    }
6306
    if ( bExp == 0x7FFF ) {
6307
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6308
        return packFloat128( zSign, 0, 0, 0 );
6309
    }
6310
    if ( bExp == 0 ) {
6311
        if ( ( bSig0 | bSig1 ) == 0 ) {
6312
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6313
 invalid:
6314
                float_raise( float_flag_invalid STATUS_VAR);
6315
                z.low = float128_default_nan_low;
6316
                z.high = float128_default_nan_high;
6317
                return z;
6318
            }
6319
            float_raise( float_flag_divbyzero STATUS_VAR);
6320
            return packFloat128( zSign, 0x7FFF, 0, 0 );
6321
        }
6322
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6323
    }
6324
    if ( aExp == 0 ) {
6325
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6326
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6327
    }
6328
    zExp = aExp - bExp + 0x3FFD;
6329
    shortShift128Left(
6330
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6331
    shortShift128Left(
6332
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6333
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6334
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6335
        ++zExp;
6336
    }
6337
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6338
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6339
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6340
    while ( (int64_t) rem0 < 0 ) {
6341
        --zSig0;
6342
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6343
    }
6344
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6345
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6346
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6347
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6348
        while ( (int64_t) rem1 < 0 ) {
6349
            --zSig1;
6350
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6351
        }
6352
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6353
    }
6354
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6355
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6356

    
6357
}
6358

    
6359
/*----------------------------------------------------------------------------
6360
| Returns the remainder of the quadruple-precision floating-point value `a'
6361
| with respect to the corresponding value `b'.  The operation is performed
6362
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6363
*----------------------------------------------------------------------------*/
6364

    
6365
float128 float128_rem( float128 a, float128 b STATUS_PARAM )
6366
{
6367
    flag aSign, zSign;
6368
    int32 aExp, bExp, expDiff;
6369
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6370
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6371
    int64_t sigMean0;
6372
    float128 z;
6373

    
6374
    aSig1 = extractFloat128Frac1( a );
6375
    aSig0 = extractFloat128Frac0( a );
6376
    aExp = extractFloat128Exp( a );
6377
    aSign = extractFloat128Sign( a );
6378
    bSig1 = extractFloat128Frac1( b );
6379
    bSig0 = extractFloat128Frac0( b );
6380
    bExp = extractFloat128Exp( b );
6381
    if ( aExp == 0x7FFF ) {
6382
        if (    ( aSig0 | aSig1 )
6383
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6384
            return propagateFloat128NaN( a, b STATUS_VAR );
6385
        }
6386
        goto invalid;
6387
    }
6388
    if ( bExp == 0x7FFF ) {
6389
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6390
        return a;
6391
    }
6392
    if ( bExp == 0 ) {
6393
        if ( ( bSig0 | bSig1 ) == 0 ) {
6394
 invalid:
6395
            float_raise( float_flag_invalid STATUS_VAR);
6396
            z.low = float128_default_nan_low;
6397
            z.high = float128_default_nan_high;
6398
            return z;
6399
        }
6400
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6401
    }
6402
    if ( aExp == 0 ) {
6403
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
6404
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6405
    }
6406
    expDiff = aExp - bExp;
6407
    if ( expDiff < -1 ) return a;
6408
    shortShift128Left(
6409
        aSig0 | LIT64( 0x0001000000000000 ),
6410
        aSig1,
6411
        15 - ( expDiff < 0 ),
6412
        &aSig0,
6413
        &aSig1
6414
    );
6415
    shortShift128Left(
6416
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6417
    q = le128( bSig0, bSig1, aSig0, aSig1 );
6418
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6419
    expDiff -= 64;
6420
    while ( 0 < expDiff ) {
6421
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6422
        q = ( 4 < q ) ? q - 4 : 0;
6423
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6424
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6425
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6426
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6427
        expDiff -= 61;
6428
    }
6429
    if ( -64 < expDiff ) {
6430
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6431
        q = ( 4 < q ) ? q - 4 : 0;
6432
        q >>= - expDiff;
6433
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6434
        expDiff += 52;
6435
        if ( expDiff < 0 ) {
6436
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6437
        }
6438
        else {
6439
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6440
        }
6441
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6442
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6443
    }
6444
    else {
6445
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6446
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6447
    }
6448
    do {
6449
        alternateASig0 = aSig0;
6450
        alternateASig1 = aSig1;
6451
        ++q;
6452
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6453
    } while ( 0 <= (int64_t) aSig0 );
6454
    add128(
6455
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6456
    if (    ( sigMean0 < 0 )
6457
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6458
        aSig0 = alternateASig0;
6459
        aSig1 = alternateASig1;
6460
    }
6461
    zSign = ( (int64_t) aSig0 < 0 );
6462
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6463
    return
6464
        normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
6465

    
6466
}
6467

    
6468
/*----------------------------------------------------------------------------
6469
| Returns the square root of the quadruple-precision floating-point value `a'.
6470
| The operation is performed according to the IEC/IEEE Standard for Binary
6471
| Floating-Point Arithmetic.
6472
*----------------------------------------------------------------------------*/
6473

    
6474
float128 float128_sqrt( float128 a STATUS_PARAM )
6475
{
6476
    flag aSign;
6477
    int32 aExp, zExp;
6478
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6479
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6480
    float128 z;
6481

    
6482
    aSig1 = extractFloat128Frac1( a );
6483
    aSig0 = extractFloat128Frac0( a );
6484
    aExp = extractFloat128Exp( a );
6485
    aSign = extractFloat128Sign( a );
6486
    if ( aExp == 0x7FFF ) {
6487
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
6488
        if ( ! aSign ) return a;
6489
        goto invalid;
6490
    }
6491
    if ( aSign ) {
6492
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6493
 invalid:
6494
        float_raise( float_flag_invalid STATUS_VAR);
6495
        z.low = float128_default_nan_low;
6496
        z.high = float128_default_nan_high;
6497
        return z;
6498
    }
6499
    if ( aExp == 0 ) {
6500
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6501
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6502
    }
6503
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6504
    aSig0 |= LIT64( 0x0001000000000000 );
6505
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6506
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6507
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6508
    doubleZSig0 = zSig0<<1;
6509
    mul64To128( zSig0, zSig0, &term0, &term1 );
6510
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6511
    while ( (int64_t) rem0 < 0 ) {
6512
        --zSig0;
6513
        doubleZSig0 -= 2;
6514
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6515
    }
6516
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6517
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6518
        if ( zSig1 == 0 ) zSig1 = 1;
6519
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6520
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6521
        mul64To128( zSig1, zSig1, &term2, &term3 );
6522
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6523
        while ( (int64_t) rem1 < 0 ) {
6524
            --zSig1;
6525
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6526
            term3 |= 1;
6527
            term2 |= doubleZSig0;
6528
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6529
        }
6530
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6531
    }
6532
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6533
    return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6534

    
6535
}
6536

    
6537
/*----------------------------------------------------------------------------
6538
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6539
| the corresponding value `b', and 0 otherwise.  The invalid exception is
6540
| raised if either operand is a NaN.  Otherwise, the comparison is performed
6541
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6542
*----------------------------------------------------------------------------*/
6543

    
6544
int float128_eq( float128 a, float128 b STATUS_PARAM )
6545
{
6546

    
6547
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6548
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6549
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6550
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6551
       ) {
6552
        float_raise( float_flag_invalid STATUS_VAR);
6553
        return 0;
6554
    }
6555
    return
6556
           ( a.low == b.low )
6557
        && (    ( a.high == b.high )
6558
             || (    ( a.low == 0 )
6559
                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6560
           );
6561

    
6562
}
6563

    
6564
/*----------------------------------------------------------------------------
6565
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6566
| or equal to the corresponding value `b', and 0 otherwise.  The invalid
6567
| exception is raised if either operand is a NaN.  The comparison is performed
6568
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6569
*----------------------------------------------------------------------------*/
6570

    
6571
int float128_le( float128 a, float128 b STATUS_PARAM )
6572
{
6573
    flag aSign, bSign;
6574

    
6575
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6576
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6577
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6578
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6579
       ) {
6580
        float_raise( float_flag_invalid STATUS_VAR);
6581
        return 0;
6582
    }
6583
    aSign = extractFloat128Sign( a );
6584
    bSign = extractFloat128Sign( b );
6585
    if ( aSign != bSign ) {
6586
        return
6587
               aSign
6588
            || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6589
                 == 0 );
6590
    }
6591
    return
6592
          aSign ? le128( b.high, b.low, a.high, a.low )
6593
        : le128( a.high, a.low, b.high, b.low );
6594

    
6595
}
6596

    
6597
/*----------------------------------------------------------------------------
6598
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6599
| the corresponding value `b', and 0 otherwise.  The invalid exception is
6600
| raised if either operand is a NaN.  The comparison is performed according
6601
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6602
*----------------------------------------------------------------------------*/
6603

    
6604
int float128_lt( float128 a, float128 b STATUS_PARAM )
6605
{
6606
    flag aSign, bSign;
6607

    
6608
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6609
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6610
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6611
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6612
       ) {
6613
        float_raise( float_flag_invalid STATUS_VAR);
6614
        return 0;
6615
    }
6616
    aSign = extractFloat128Sign( a );
6617
    bSign = extractFloat128Sign( b );
6618
    if ( aSign != bSign ) {
6619
        return
6620
               aSign
6621
            && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6622
                 != 0 );
6623
    }
6624
    return
6625
          aSign ? lt128( b.high, b.low, a.high, a.low )
6626
        : lt128( a.high, a.low, b.high, b.low );
6627

    
6628
}
6629

    
6630
/*----------------------------------------------------------------------------
6631
| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6632
| be compared, and 0 otherwise.  The invalid exception is raised if either
6633
| operand is a NaN. The comparison is performed according to the IEC/IEEE
6634
| Standard for Binary Floating-Point Arithmetic.
6635
*----------------------------------------------------------------------------*/
6636

    
6637
int float128_unordered( float128 a, float128 b STATUS_PARAM )
6638
{
6639
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6640
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6641
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6642
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6643
       ) {
6644
        float_raise( float_flag_invalid STATUS_VAR);
6645
        return 1;
6646
    }
6647
    return 0;
6648
}
6649

    
6650
/*----------------------------------------------------------------------------
6651
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6652
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6653
| exception.  The comparison is performed according to the IEC/IEEE Standard
6654
| for Binary Floating-Point Arithmetic.
6655
*----------------------------------------------------------------------------*/
6656

    
6657
int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
6658
{
6659

    
6660
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6661
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6662
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6663
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6664
       ) {
6665
        if (    float128_is_signaling_nan( a )
6666
             || float128_is_signaling_nan( b ) ) {
6667
            float_raise( float_flag_invalid STATUS_VAR);
6668
        }
6669
        return 0;
6670
    }
6671
    return
6672
           ( a.low == b.low )
6673
        && (    ( a.high == b.high )
6674
             || (    ( a.low == 0 )
6675
                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6676
           );
6677

    
6678
}
6679

    
6680
/*----------------------------------------------------------------------------
6681
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6682
| or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6683
| cause an exception.  Otherwise, the comparison is performed according to the
6684
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6685
*----------------------------------------------------------------------------*/
6686

    
6687
int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
6688
{
6689
    flag aSign, bSign;
6690

    
6691
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6692
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6693
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6694
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6695
       ) {
6696
        if (    float128_is_signaling_nan( a )
6697
             || float128_is_signaling_nan( b ) ) {
6698
            float_raise( float_flag_invalid STATUS_VAR);
6699
        }
6700
        return 0;
6701
    }
6702
    aSign = extractFloat128Sign( a );
6703
    bSign = extractFloat128Sign( b );
6704
    if ( aSign != bSign ) {
6705
        return
6706
               aSign
6707
            || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6708
                 == 0 );
6709
    }
6710
    return
6711
          aSign ? le128( b.high, b.low, a.high, a.low )
6712
        : le128( a.high, a.low, b.high, b.low );
6713

    
6714
}
6715

    
6716
/*----------------------------------------------------------------------------
6717
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6718
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6719
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6720
| Standard for Binary Floating-Point Arithmetic.
6721
*----------------------------------------------------------------------------*/
6722

    
6723
int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
6724
{
6725
    flag aSign, bSign;
6726

    
6727
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6728
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6729
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6730
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6731
       ) {
6732
        if (    float128_is_signaling_nan( a )
6733
             || float128_is_signaling_nan( b ) ) {
6734
            float_raise( float_flag_invalid STATUS_VAR);
6735
        }
6736
        return 0;
6737
    }
6738
    aSign = extractFloat128Sign( a );
6739
    bSign = extractFloat128Sign( b );
6740
    if ( aSign != bSign ) {
6741
        return
6742
               aSign
6743
            && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6744
                 != 0 );
6745
    }
6746
    return
6747
          aSign ? lt128( b.high, b.low, a.high, a.low )
6748
        : lt128( a.high, a.low, b.high, b.low );
6749

    
6750
}
6751

    
6752
/*----------------------------------------------------------------------------
6753
| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6754
| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
6755
| comparison is performed according to the IEC/IEEE Standard for Binary
6756
| Floating-Point Arithmetic.
6757
*----------------------------------------------------------------------------*/
6758

    
6759
int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
6760
{
6761
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6762
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6763
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6764
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6765
       ) {
6766
        if (    float128_is_signaling_nan( a )
6767
             || float128_is_signaling_nan( b ) ) {
6768
            float_raise( float_flag_invalid STATUS_VAR);
6769
        }
6770
        return 1;
6771
    }
6772
    return 0;
6773
}
6774

    
6775
/* misc functions */
6776
float32 uint32_to_float32(uint32_t a STATUS_PARAM)
{
    /* Convert an unsigned 32-bit integer to single precision by widening it
     * to 64 bits and delegating to the signed 64-bit conversion; every
     * uint32_t value is non-negative and fits in int64_t. */
    int64_t widened = a;

    return int64_to_float32(widened STATUS_VAR);
}
6780

    
6781
float64 uint32_to_float64(uint32_t a STATUS_PARAM)
{
    /* Convert an unsigned 32-bit integer to double precision by widening it
     * to 64 bits and delegating to the signed 64-bit conversion; every
     * uint32_t value is non-negative and fits in int64_t. */
    int64_t widened = a;

    return int64_to_float64(widened STATUS_VAR);
}
6785

    
6786
uint32 float32_to_uint32( float32 a STATUS_PARAM )
{
    /* Convert single precision to an unsigned 32-bit integer by going
     * through the signed 64-bit conversion.  Results outside [0, 0xffffffff]
     * are clamped to the nearest bound; in that case the exception flags are
     * reset to their value on entry and only the invalid flag is added. */
    const int saved_flags = get_float_exception_flags(status);
    const int64_t wide = float32_to_int64(a STATUS_VAR);

    if (wide >= 0 && wide <= 0xffffffff) {
        return wide;
    }

    /* Out of range: discard whatever the wide conversion raised. */
    set_float_exception_flags(saved_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return (wide < 0) ? 0 : 0xffffffff;
}
6804

    
6805
uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
{
    /* Same as float32_to_uint32, but based on the round-to-zero 64-bit
     * conversion.  Results outside [0, 0xffffffff] are clamped to the
     * nearest bound; in that case the exception flags are reset to their
     * value on entry and only the invalid flag is added. */
    const int saved_flags = get_float_exception_flags(status);
    const int64_t wide = float32_to_int64_round_to_zero(a STATUS_VAR);

    if (wide >= 0 && wide <= 0xffffffff) {
        return wide;
    }

    /* Out of range: discard whatever the wide conversion raised. */
    set_float_exception_flags(saved_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return (wide < 0) ? 0 : 0xffffffff;
}
6823

    
6824
int_fast16_t float32_to_int16(float32 a STATUS_PARAM)
{
    /* Convert single precision to a signed 16-bit range by going through
     * the 32-bit conversion.  Results outside [-0x8000, 0x7fff] are clamped
     * to the nearest bound; in that case the exception flags are reset to
     * their value on entry and only the invalid flag is added. */
    const int saved_flags = get_float_exception_flags(status);
    const int32_t wide = float32_to_int32(a STATUS_VAR);

    if (wide >= -0x8000 && wide <= 0x7fff) {
        return wide;
    }

    /* Out of range: discard whatever the wide conversion raised. */
    set_float_exception_flags(saved_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return (wide < -0x8000) ? -0x8000 : 0x7fff;
}
6843

    
6844
uint_fast16_t float32_to_uint16(float32 a STATUS_PARAM)
{
    /* Convert single precision to an unsigned 16-bit range by going through
     * the signed 32-bit conversion.  Results outside [0, 0xffff] are clamped
     * to the nearest bound; in that case the exception flags are reset to
     * their value on entry and only the invalid flag is added. */
    const int saved_flags = get_float_exception_flags(status);
    const int32_t wide = float32_to_int32(a STATUS_VAR);

    if (wide >= 0 && wide <= 0xffff) {
        return wide;
    }

    /* Out of range: discard whatever the wide conversion raised. */
    set_float_exception_flags(saved_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return (wide < 0) ? 0 : 0xffff;
}
6863

    
6864
uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM)
{
    /* Convert single precision to an unsigned 16-bit range using the
     * round-to-zero signed 64-bit conversion.  Results outside [0, 0xffff]
     * are clamped to the nearest bound; in that case the exception flags are
     * reset to their value on entry and only the invalid flag is added. */
    const int saved_flags = get_float_exception_flags(status);
    const int64_t wide = float32_to_int64_round_to_zero(a STATUS_VAR);

    if (wide >= 0 && wide <= 0xffff) {
        return wide;
    }

    /* Out of range: discard whatever the wide conversion raised. */
    set_float_exception_flags(saved_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return (wide < 0) ? 0 : 0xffff;
}
6882

    
6883
uint32 float64_to_uint32( float64 a STATUS_PARAM )
{
    /* Convert double precision to an unsigned 32-bit integer by going
     * through the unsigned 64-bit conversion.  A result above 0xffffffff is
     * clamped to 0xffffffff; in that case the exception flags are reset to
     * their value on entry and only the invalid flag is added. */
    const int saved_flags = get_float_exception_flags(status);
    const uint64_t wide = float64_to_uint64(a STATUS_VAR);

    if (wide <= 0xffffffff) {
        return wide;
    }

    /* Out of range: discard whatever the wide conversion raised. */
    set_float_exception_flags(saved_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return 0xffffffff;
}
6899

    
6900
uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
{
    /* Same as float64_to_uint32, but based on the round-to-zero unsigned
     * 64-bit conversion.  A result above 0xffffffff is clamped to
     * 0xffffffff; in that case the exception flags are reset to their value
     * on entry and only the invalid flag is added. */
    const int saved_flags = get_float_exception_flags(status);
    const uint64_t wide = float64_to_uint64_round_to_zero(a STATUS_VAR);

    if (wide <= 0xffffffff) {
        return wide;
    }

    /* Out of range: discard whatever the wide conversion raised. */
    set_float_exception_flags(saved_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return 0xffffffff;
}
6916

    
6917
int_fast16_t float64_to_int16(float64 a STATUS_PARAM)
{
    /* Convert double precision to a signed 16-bit range by going through
     * the 32-bit conversion.  Results outside [-0x8000, 0x7fff] are clamped
     * to the nearest bound; in that case the exception flags are reset to
     * their value on entry and only the invalid flag is added. */
    const int saved_flags = get_float_exception_flags(status);
    const int64_t wide = float64_to_int32(a STATUS_VAR);

    if (wide >= -0x8000 && wide <= 0x7fff) {
        return wide;
    }

    /* Out of range: discard whatever the wide conversion raised. */
    set_float_exception_flags(saved_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return (wide < -0x8000) ? -0x8000 : 0x7fff;
}
6936

    
6937
uint_fast16_t float64_to_uint16(float64 a STATUS_PARAM)
{
    /* Convert double precision to an unsigned 16-bit range by going through
     * the signed 32-bit conversion.  Results outside [0, 0xffff] are clamped
     * to the nearest bound; in that case the exception flags are reset to
     * their value on entry and only the invalid flag is added. */
    const int saved_flags = get_float_exception_flags(status);
    const int64_t wide = float64_to_int32(a STATUS_VAR);

    if (wide >= 0 && wide <= 0xffff) {
        return wide;
    }

    /* Out of range: discard whatever the wide conversion raised. */
    set_float_exception_flags(saved_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return (wide < 0) ? 0 : 0xffff;
}
6956

    
6957
uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM)
{
    /* Convert double precision to an unsigned 16-bit range using the
     * round-to-zero signed 64-bit conversion.  Results outside [0, 0xffff]
     * are clamped to the nearest bound; in that case the exception flags are
     * reset to their value on entry and only the invalid flag is added. */
    const int saved_flags = get_float_exception_flags(status);
    const int64_t wide = float64_to_int64_round_to_zero(a STATUS_VAR);

    if (wide >= 0 && wide <= 0xffff) {
        return wide;
    }

    /* Out of range: discard whatever the wide conversion raised. */
    set_float_exception_flags(saved_flags, status);
    float_raise(float_flag_invalid STATUS_VAR);
    return (wide < 0) ? 0 : 0xffff;
}
6975

    
6976
/*----------------------------------------------------------------------------
6977
| Returns the result of converting the double-precision floating-point value
6978
| `a' to the 64-bit unsigned integer format.  The conversion is
6979
| performed according to the IEC/IEEE Standard for Binary Floating-Point
6980
| Arithmetic---which means in particular that the conversion is rounded
6981
| according to the current rounding mode.  If `a' is a NaN, the largest
6982
| positive integer is returned.  If the conversion overflows, the
6983
| largest unsigned integer is returned.  If 'a' is negative, the value is
6984
| rounded and zero is returned; negative values that do not round to zero
6985
| will raise the inexact exception.
6986
*----------------------------------------------------------------------------*/
6987

    
6988
uint64_t float64_to_uint64(float64 a STATUS_PARAM)
6989
{
6990
    flag aSign;
6991
    int_fast16_t aExp, shiftCount;
6992
    uint64_t aSig, aSigExtra;
6993
    a = float64_squash_input_denormal(a STATUS_VAR);
6994

    
6995
    aSig = extractFloat64Frac(a);
6996
    aExp = extractFloat64Exp(a);
6997
    aSign = extractFloat64Sign(a);
6998
    if (aSign && (aExp > 1022)) {
6999
        float_raise(float_flag_invalid STATUS_VAR);
7000
        if (float64_is_any_nan(a)) {
7001
            return LIT64(0xFFFFFFFFFFFFFFFF);
7002
        } else {
7003
            return 0;
7004
        }
7005
    }
7006
    if (aExp) {
7007
        aSig |= LIT64(0x0010000000000000);
7008
    }
7009
    shiftCount = 0x433 - aExp;
7010
    if (shiftCount <= 0) {
7011
        if (0x43E < aExp) {
7012
            float_raise(float_flag_invalid STATUS_VAR);
7013
            return LIT64(0xFFFFFFFFFFFFFFFF);
7014
        }
7015
        aSigExtra = 0;
7016
        aSig <<= -shiftCount;
7017
    } else {
7018
        shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7019
    }
7020
    return roundAndPackUint64(aSign, aSig, aSigExtra STATUS_VAR);
7021
}
7022

    
7023
uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
{
    /* Convert double precision to an unsigned 64-bit integer with the
     * rounding mode temporarily forced to round-to-zero.  The rounding mode
     * that was active on entry is restored before returning. */
    signed char current_rounding_mode = STATUS(float_rounding_mode);
    /* Keep the result unsigned: a signed temporary would push values at or
     * above 2^63 through an implementation-defined conversion. */
    uint64_t v;

    set_float_rounding_mode(float_round_to_zero STATUS_VAR);
    v = float64_to_uint64(a STATUS_VAR);
    set_float_rounding_mode(current_rounding_mode STATUS_VAR);
    return v;
}
7031

    
7032
#define COMPARE(s, nan_exp)                                                  \
7033
INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
7034
                                      int is_quiet STATUS_PARAM )            \
7035
{                                                                            \
7036
    flag aSign, bSign;                                                       \
7037
    uint ## s ## _t av, bv;                                                  \
7038
    a = float ## s ## _squash_input_denormal(a STATUS_VAR);                  \
7039
    b = float ## s ## _squash_input_denormal(b STATUS_VAR);                  \
7040
                                                                             \
7041
    if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
7042
         extractFloat ## s ## Frac( a ) ) ||                                 \
7043
        ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
7044
          extractFloat ## s ## Frac( b ) )) {                                \
7045
        if (!is_quiet ||                                                     \
7046
            float ## s ## _is_signaling_nan( a ) ||                          \
7047
            float ## s ## _is_signaling_nan( b ) ) {                         \
7048
            float_raise( float_flag_invalid STATUS_VAR);                     \
7049
        }                                                                    \
7050
        return float_relation_unordered;                                     \
7051
    }                                                                        \
7052
    aSign = extractFloat ## s ## Sign( a );                                  \
7053
    bSign = extractFloat ## s ## Sign( b );                                  \
7054
    av = float ## s ## _val(a);                                              \
7055
    bv = float ## s ## _val(b);                                              \
7056
    if ( aSign != bSign ) {                                                  \
7057
        if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
7058
            /* zero case */                                                  \
7059
            return float_relation_equal;                                     \
7060
        } else {                                                             \
7061
            return 1 - (2 * aSign);                                          \
7062
        }                                                                    \
7063
    } else {                                                                 \
7064
        if (av == bv) {                                                      \
7065
            return float_relation_equal;                                     \
7066
        } else {                                                             \
7067
            return 1 - 2 * (aSign ^ ( av < bv ));                            \
7068
        }                                                                    \
7069
    }                                                                        \
7070
}                                                                            \
7071
                                                                             \
7072
int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
7073
{                                                                            \
7074
    return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
7075
}                                                                            \
7076
                                                                             \
7077
int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
7078
{                                                                            \
7079
    return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
7080
}
7081

    
7082
COMPARE(32, 0xff)
7083
COMPARE(64, 0x7ff)
7084

    
7085
INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
7086
                                      int is_quiet STATUS_PARAM )
7087
{
7088
    flag aSign, bSign;
7089

    
7090
    if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7091
          ( extractFloatx80Frac( a )<<1 ) ) ||
7092
        ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7093
          ( extractFloatx80Frac( b )<<1 ) )) {
7094
        if (!is_quiet ||
7095
            floatx80_is_signaling_nan( a ) ||
7096
            floatx80_is_signaling_nan( b ) ) {
7097
            float_raise( float_flag_invalid STATUS_VAR);
7098
        }
7099
        return float_relation_unordered;
7100
    }
7101
    aSign = extractFloatx80Sign( a );
7102
    bSign = extractFloatx80Sign( b );
7103
    if ( aSign != bSign ) {
7104

    
7105
        if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7106
             ( ( a.low | b.low ) == 0 ) ) {
7107
            /* zero case */
7108
            return float_relation_equal;
7109
        } else {
7110
            return 1 - (2 * aSign);
7111
        }
7112
    } else {
7113
        if (a.low == b.low && a.high == b.high) {
7114
            return float_relation_equal;
7115
        } else {
7116
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7117
        }
7118
    }
7119
}
7120

    
7121
/*----------------------------------------------------------------------------
| Signaling comparison of the extended double-precision values `a' and `b':
| forwards to floatx80_compare_internal() with `is_quiet' cleared, so the
| invalid exception is raised whenever an operand is a NaN.
*----------------------------------------------------------------------------*/
int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
{
    const int is_quiet = 0;

    return floatx80_compare_internal( a, b, is_quiet STATUS_VAR );
}
7125

    
7126
/*----------------------------------------------------------------------------
| Quiet comparison of the extended double-precision values `a' and `b':
| forwards to floatx80_compare_internal() with `is_quiet' set, so the invalid
| exception is raised only when an operand is a signaling NaN.
*----------------------------------------------------------------------------*/
int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
{
    const int is_quiet = 1;

    return floatx80_compare_internal( a, b, is_quiet STATUS_VAR );
}
7130

    
7131
INLINE int float128_compare_internal( float128 a, float128 b,
7132
                                      int is_quiet STATUS_PARAM )
7133
{
7134
    flag aSign, bSign;
7135

    
7136
    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7137
          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7138
        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7139
          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7140
        if (!is_quiet ||
7141
            float128_is_signaling_nan( a ) ||
7142
            float128_is_signaling_nan( b ) ) {
7143
            float_raise( float_flag_invalid STATUS_VAR);
7144
        }
7145
        return float_relation_unordered;
7146
    }
7147
    aSign = extractFloat128Sign( a );
7148
    bSign = extractFloat128Sign( b );
7149
    if ( aSign != bSign ) {
7150
        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7151
            /* zero case */
7152
            return float_relation_equal;
7153
        } else {
7154
            return 1 - (2 * aSign);
7155
        }
7156
    } else {
7157
        if (a.low == b.low && a.high == b.high) {
7158
            return float_relation_equal;
7159
        } else {
7160
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7161
        }
7162
    }
7163
}
7164

    
7165
/*----------------------------------------------------------------------------
| Signaling comparison of the quadruple-precision values `a' and `b':
| forwards to float128_compare_internal() with `is_quiet' cleared, so the
| invalid exception is raised whenever an operand is a NaN.
*----------------------------------------------------------------------------*/
int float128_compare( float128 a, float128 b STATUS_PARAM )
{
    const int is_quiet = 0;

    return float128_compare_internal( a, b, is_quiet STATUS_VAR );
}
7169

    
7170
/*----------------------------------------------------------------------------
| Quiet comparison of the quadruple-precision values `a' and `b': forwards
| to float128_compare_internal() with `is_quiet' set, so the invalid
| exception is raised only when an operand is a signaling NaN.
*----------------------------------------------------------------------------*/
int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
{
    const int is_quiet = 1;

    return float128_compare_internal( a, b, is_quiet STATUS_VAR );
}
7174

    
7175
/* min() and max() functions. These can't be implemented as
7176
 * 'compare and pick one input' because that would mishandle
7177
 * NaNs and +0 vs -0.
7178
 *
7179
 * minnum() and maxnum() functions. These are similar to the min()
7180
 * and max() functions but if one of the arguments is a QNaN and
7181
 * the other is numerical then the numerical argument is returned.
7182
 * minnum() and maxnum() correspond to the IEEE 754-2008 minNum()
7183
 * and maxNum() operations. min() and max() are the typical min/max
7184
 * semantics provided by many CPUs which predate that specification.
7185
 */
7186
#define MINMAX(s)                                                       \
7187
INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
7188
                                        int ismin, int isieee STATUS_PARAM) \
7189
{                                                                       \
7190
    flag aSign, bSign;                                                  \
7191
    uint ## s ## _t av, bv;                                             \
7192
    a = float ## s ## _squash_input_denormal(a STATUS_VAR);             \
7193
    b = float ## s ## _squash_input_denormal(b STATUS_VAR);             \
7194
    if (float ## s ## _is_any_nan(a) ||                                 \
7195
        float ## s ## _is_any_nan(b)) {                                 \
7196
        if (isieee) {                                                   \
7197
            if (float ## s ## _is_quiet_nan(a) &&                       \
7198
                !float ## s ##_is_any_nan(b)) {                         \
7199
                return b;                                               \
7200
            } else if (float ## s ## _is_quiet_nan(b) &&                \
7201
                       !float ## s ## _is_any_nan(a)) {                 \
7202
                return a;                                               \
7203
            }                                                           \
7204
        }                                                               \
7205
        return propagateFloat ## s ## NaN(a, b STATUS_VAR);             \
7206
    }                                                                   \
7207
    aSign = extractFloat ## s ## Sign(a);                               \
7208
    bSign = extractFloat ## s ## Sign(b);                               \
7209
    av = float ## s ## _val(a);                                         \
7210
    bv = float ## s ## _val(b);                                         \
7211
    if (aSign != bSign) {                                               \
7212
        if (ismin) {                                                    \
7213
            return aSign ? a : b;                                       \
7214
        } else {                                                        \
7215
            return aSign ? b : a;                                       \
7216
        }                                                               \
7217
    } else {                                                            \
7218
        if (ismin) {                                                    \
7219
            return (aSign ^ (av < bv)) ? a : b;                         \
7220
        } else {                                                        \
7221
            return (aSign ^ (av < bv)) ? b : a;                         \
7222
        }                                                               \
7223
    }                                                                   \
7224
}                                                                       \
7225
                                                                        \
7226
float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM)  \
7227
{                                                                       \
7228
    return float ## s ## _minmax(a, b, 1, 0 STATUS_VAR);                \
7229
}                                                                       \
7230
                                                                        \
7231
float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM)  \
7232
{                                                                       \
7233
    return float ## s ## _minmax(a, b, 0, 0 STATUS_VAR);                \
7234
}                                                                       \
7235
                                                                        \
7236
float ## s float ## s ## _minnum(float ## s a, float ## s b STATUS_PARAM) \
7237
{                                                                       \
7238
    return float ## s ## _minmax(a, b, 1, 1 STATUS_VAR);                \
7239
}                                                                       \
7240
                                                                        \
7241
float ## s float ## s ## _maxnum(float ## s a, float ## s b STATUS_PARAM) \
7242
{                                                                       \
7243
    return float ## s ## _minmax(a, b, 0, 1 STATUS_VAR);                \
7244
}
7245

    
7246
MINMAX(32)
7247
MINMAX(64)
7248

    
7249

    
7250
/* Multiply A by 2 raised to the power N.  */
7251
float32 float32_scalbn( float32 a, int n STATUS_PARAM )
7252
{
7253
    flag aSign;
7254
    int16_t aExp;
7255
    uint32_t aSig;
7256

    
7257
    a = float32_squash_input_denormal(a STATUS_VAR);
7258
    aSig = extractFloat32Frac( a );
7259
    aExp = extractFloat32Exp( a );
7260
    aSign = extractFloat32Sign( a );
7261

    
7262
    if ( aExp == 0xFF ) {
7263
        if ( aSig ) {
7264
            return propagateFloat32NaN( a, a STATUS_VAR );
7265
        }
7266
        return a;
7267
    }
7268
    if (aExp != 0) {
7269
        aSig |= 0x00800000;
7270
    } else if (aSig == 0) {
7271
        return a;
7272
    } else {
7273
        aExp++;
7274
    }
7275

    
7276
    if (n > 0x200) {
7277
        n = 0x200;
7278
    } else if (n < -0x200) {
7279
        n = -0x200;
7280
    }
7281

    
7282
    aExp += n - 1;
7283
    aSig <<= 7;
7284
    return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
7285
}
7286

    
7287
/* Multiply the double-precision value A by 2 raised to the power N.
 * NaN inputs go through propagateFloat64NaN(); infinities and zeros are
 * returned unchanged.  N is clamped to [-0x1000, 0x1000] before it is added
 * to the exponent.
 */
float64 float64_scalbn( float64 a, int n STATUS_PARAM )
{
    flag sign;
    int16_t expo;
    uint64_t sig;

    a = float64_squash_input_denormal(a STATUS_VAR);
    sig = extractFloat64Frac( a );
    expo = extractFloat64Exp( a );
    sign = extractFloat64Sign( a );

    if ( expo == 0x7FF ) {
        /* NaN when the fraction is non-zero, otherwise infinity */
        if ( sig ) {
            return propagateFloat64NaN( a, a STATUS_VAR );
        }
        return a;
    }

    if (expo == 0) {
        if (sig == 0) {
            /* zero stays zero */
            return a;
        }
        /* zero exponent field with a non-zero fraction */
        expo++;
    } else {
        /* make the leading significand bit explicit */
        sig |= LIT64( 0x0010000000000000 );
    }

    n = (n > 0x1000) ? 0x1000 : ((n < -0x1000) ? -0x1000 : n);

    expo += n - 1;
    sig <<= 10;
    return normalizeRoundAndPackFloat64( sign, expo, sig STATUS_VAR );
}
7322

    
7323
/* Multiply the extended double-precision value A by 2 raised to the power N.
 * NaN inputs go through propagateFloatx80NaN(); infinities and zeros are
 * returned unchanged.  N is clamped to [-0x10000, 0x10000] before it is
 * added to the exponent, and the result is rounded at the precision held in
 * the status field floatx80_rounding_precision.
 */
floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
{
    flag sign;
    int32_t expo;
    uint64_t sig;

    sig = extractFloatx80Frac( a );
    expo = extractFloatx80Exp( a );
    sign = extractFloatx80Sign( a );

    if ( expo == 0x7FFF ) {
        /* NaN when any bit below the top significand bit is set,
         * otherwise infinity
         */
        if ( sig<<1 ) {
            return propagateFloatx80NaN( a, a STATUS_VAR );
        }
        return a;
    }

    if (expo == 0) {
        if (sig == 0) {
            /* zero stays zero */
            return a;
        }
        /* zero exponent field with a non-zero significand */
        expo++;
    }

    n = (n > 0x10000) ? 0x10000 : ((n < -0x10000) ? -0x10000 : n);

    expo += n;
    return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
                                          sign, expo, sig, 0 STATUS_VAR );
}
7357

    
7358
/* Multiply the quadruple-precision value A by 2 raised to the power N.
 * NaN inputs go through propagateFloat128NaN(); infinities and zeros are
 * returned unchanged.  N is clamped to [-0x10000, 0x10000] before it is
 * added to the exponent.
 */
float128 float128_scalbn( float128 a, int n STATUS_PARAM )
{
    flag sign;
    int32_t expo;
    uint64_t sigHi, sigLo;

    sigLo = extractFloat128Frac1( a );
    sigHi = extractFloat128Frac0( a );
    expo = extractFloat128Exp( a );
    sign = extractFloat128Sign( a );

    if ( expo == 0x7FFF ) {
        /* NaN when the fraction is non-zero, otherwise infinity */
        if ( sigHi | sigLo ) {
            return propagateFloat128NaN( a, a STATUS_VAR );
        }
        return a;
    }

    if (expo == 0) {
        if ((sigHi | sigLo) == 0) {
            /* zero stays zero */
            return a;
        }
        /* zero exponent field with a non-zero fraction */
        expo++;
    } else {
        /* make the leading significand bit explicit */
        sigHi |= LIT64( 0x0001000000000000 );
    }

    n = (n > 0x10000) ? 0x10000 : ((n < -0x10000) ? -0x10000 : n);

    expo += n - 1;
    return normalizeRoundAndPackFloat128( sign, expo, sigHi, sigLo
                                          STATUS_VAR );
}