Statistics
| Branch: | Revision:

root / fpu / softfloat.c @ bb98fe42

History | View | Annotate | Download (215.3 kB)

1
/*
2
 * QEMU float support
3
 *
4
 * Derived from SoftFloat.
5
 */
6

    
7
/*============================================================================
8

9
This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
10
Package, Release 2b.
11

12
Written by John R. Hauser.  This work was made possible in part by the
13
International Computer Science Institute, located at Suite 600, 1947 Center
14
Street, Berkeley, California 94704.  Funding was partially provided by the
15
National Science Foundation under grant MIP-9311980.  The original version
16
of this code was written as part of a project to build a fixed-point vector
17
processor in collaboration with the University of California at Berkeley,
18
overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
19
is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20
arithmetic/SoftFloat.html'.
21

22
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
23
been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24
RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25
AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26
COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27
EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28
INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29
OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
30

31
Derivative works are acceptable, even for commercial purposes, so long as
32
(1) the source code for the derivative work includes prominent notice that
33
the work is derivative, and (2) the source code includes prominent notice with
34
these four paragraphs for those parts of this code that are retained.
35

36
=============================================================================*/
37

    
38
#include "softfloat.h"
39

    
40
/*----------------------------------------------------------------------------
41
| Primitive arithmetic functions, including multi-word arithmetic, and
42
| division and square root approximations.  (Can be specialized to target if
43
| desired.)
44
*----------------------------------------------------------------------------*/
45
#include "softfloat-macros.h"
46

    
47
/*----------------------------------------------------------------------------
48
| Functions and definitions to determine:  (1) whether tininess for underflow
49
| is detected before or after rounding by default, (2) what (if anything)
50
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
51
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
52
| are propagated from function inputs to output.  These details are target-
53
| specific.
54
*----------------------------------------------------------------------------*/
55
#include "softfloat-specialize.h"
56

    
57
void set_float_rounding_mode(int val STATUS_PARAM)
58
{
59
    STATUS(float_rounding_mode) = val;
60
}
61

    
62
void set_float_exception_flags(int val STATUS_PARAM)
63
{
64
    STATUS(float_exception_flags) = val;
65
}
66

    
67
#ifdef FLOATX80
68
void set_floatx80_rounding_precision(int val STATUS_PARAM)
69
{
70
    STATUS(floatx80_rounding_precision) = val;
71
}
72
#endif
73

    
74
/*----------------------------------------------------------------------------
75
| Returns the fraction bits of the half-precision floating-point value `a'.
76
*----------------------------------------------------------------------------*/
77

    
78
INLINE uint32_t extractFloat16Frac(float16 a)
79
{
80
    return float16_val(a) & 0x3ff;
81
}
82

    
83
/*----------------------------------------------------------------------------
84
| Returns the exponent bits of the half-precision floating-point value `a'.
85
*----------------------------------------------------------------------------*/
86

    
87
INLINE int16 extractFloat16Exp(float16 a)
88
{
89
    return (float16_val(a) >> 10) & 0x1f;
90
}
91

    
92
/*----------------------------------------------------------------------------
93
| Returns the sign bit of the single-precision floating-point value `a'.
94
*----------------------------------------------------------------------------*/
95

    
96
INLINE flag extractFloat16Sign(float16 a)
97
{
98
    return float16_val(a)>>15;
99
}
100

    
101
/*----------------------------------------------------------------------------
102
| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
103
| and 7, and returns the properly rounded 32-bit integer corresponding to the
104
| input.  If `zSign' is 1, the input is negated before being converted to an
105
| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
106
| is simply rounded to an integer, with the inexact exception raised if the
107
| input cannot be represented exactly as an integer.  However, if the fixed-
108
| point input is too large, the invalid exception is raised and the largest
109
| positive or negative integer is returned.
110
*----------------------------------------------------------------------------*/
111

    
112
static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
113
{
114
    int8 roundingMode;
115
    flag roundNearestEven;
116
    int8 roundIncrement, roundBits;
117
    int32 z;
118

    
119
    roundingMode = STATUS(float_rounding_mode);
120
    roundNearestEven = ( roundingMode == float_round_nearest_even );
121
    roundIncrement = 0x40;
122
    if ( ! roundNearestEven ) {
123
        if ( roundingMode == float_round_to_zero ) {
124
            roundIncrement = 0;
125
        }
126
        else {
127
            roundIncrement = 0x7F;
128
            if ( zSign ) {
129
                if ( roundingMode == float_round_up ) roundIncrement = 0;
130
            }
131
            else {
132
                if ( roundingMode == float_round_down ) roundIncrement = 0;
133
            }
134
        }
135
    }
136
    roundBits = absZ & 0x7F;
137
    absZ = ( absZ + roundIncrement )>>7;
138
    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
139
    z = absZ;
140
    if ( zSign ) z = - z;
141
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
142
        float_raise( float_flag_invalid STATUS_VAR);
143
        return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
144
    }
145
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
146
    return z;
147

    
148
}
149

    
150
/*----------------------------------------------------------------------------
151
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
152
| `absZ1', with binary point between bits 63 and 64 (between the input words),
153
| and returns the properly rounded 64-bit integer corresponding to the input.
154
| If `zSign' is 1, the input is negated before being converted to an integer.
155
| Ordinarily, the fixed-point input is simply rounded to an integer, with
156
| the inexact exception raised if the input cannot be represented exactly as
157
| an integer.  However, if the fixed-point input is too large, the invalid
158
| exception is raised and the largest positive or negative integer is
159
| returned.
160
*----------------------------------------------------------------------------*/
161

    
162
static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
163
{
164
    int8 roundingMode;
165
    flag roundNearestEven, increment;
166
    int64 z;
167

    
168
    roundingMode = STATUS(float_rounding_mode);
169
    roundNearestEven = ( roundingMode == float_round_nearest_even );
170
    increment = ( (int64_t) absZ1 < 0 );
171
    if ( ! roundNearestEven ) {
172
        if ( roundingMode == float_round_to_zero ) {
173
            increment = 0;
174
        }
175
        else {
176
            if ( zSign ) {
177
                increment = ( roundingMode == float_round_down ) && absZ1;
178
            }
179
            else {
180
                increment = ( roundingMode == float_round_up ) && absZ1;
181
            }
182
        }
183
    }
184
    if ( increment ) {
185
        ++absZ0;
186
        if ( absZ0 == 0 ) goto overflow;
187
        absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
188
    }
189
    z = absZ0;
190
    if ( zSign ) z = - z;
191
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
192
 overflow:
193
        float_raise( float_flag_invalid STATUS_VAR);
194
        return
195
              zSign ? (int64_t) LIT64( 0x8000000000000000 )
196
            : LIT64( 0x7FFFFFFFFFFFFFFF );
197
    }
198
    if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
199
    return z;
200

    
201
}
202

    
203
/*----------------------------------------------------------------------------
204
| Returns the fraction bits of the single-precision floating-point value `a'.
205
*----------------------------------------------------------------------------*/
206

    
207
INLINE uint32_t extractFloat32Frac( float32 a )
208
{
209

    
210
    return float32_val(a) & 0x007FFFFF;
211

    
212
}
213

    
214
/*----------------------------------------------------------------------------
215
| Returns the exponent bits of the single-precision floating-point value `a'.
216
*----------------------------------------------------------------------------*/
217

    
218
INLINE int16 extractFloat32Exp( float32 a )
219
{
220

    
221
    return ( float32_val(a)>>23 ) & 0xFF;
222

    
223
}
224

    
225
/*----------------------------------------------------------------------------
226
| Returns the sign bit of the single-precision floating-point value `a'.
227
*----------------------------------------------------------------------------*/
228

    
229
INLINE flag extractFloat32Sign( float32 a )
230
{
231

    
232
    return float32_val(a)>>31;
233

    
234
}
235

    
236
/*----------------------------------------------------------------------------
237
| If `a' is denormal and we are in flush-to-zero mode then set the
238
| input-denormal exception and return zero. Otherwise just return the value.
239
*----------------------------------------------------------------------------*/
240
static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
241
{
242
    if (STATUS(flush_inputs_to_zero)) {
243
        if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
244
            float_raise(float_flag_input_denormal STATUS_VAR);
245
            return make_float32(float32_val(a) & 0x80000000);
246
        }
247
    }
248
    return a;
249
}
250

    
251
/*----------------------------------------------------------------------------
252
| Normalizes the subnormal single-precision floating-point value represented
253
| by the denormalized significand `aSig'.  The normalized exponent and
254
| significand are stored at the locations pointed to by `zExpPtr' and
255
| `zSigPtr', respectively.
256
*----------------------------------------------------------------------------*/
257

    
258
static void
259
 normalizeFloat32Subnormal( uint32_t aSig, int16 *zExpPtr, uint32_t *zSigPtr )
260
{
261
    int8 shiftCount;
262

    
263
    shiftCount = countLeadingZeros32( aSig ) - 8;
264
    *zSigPtr = aSig<<shiftCount;
265
    *zExpPtr = 1 - shiftCount;
266

    
267
}
268

    
269
/*----------------------------------------------------------------------------
270
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
271
| single-precision floating-point value, returning the result.  After being
272
| shifted into the proper positions, the three fields are simply added
273
| together to form the result.  This means that any integer portion of `zSig'
274
| will be added into the exponent.  Since a properly normalized significand
275
| will have an integer portion equal to 1, the `zExp' input should be 1 less
276
| than the desired result exponent whenever `zSig' is a complete, normalized
277
| significand.
278
*----------------------------------------------------------------------------*/
279

    
280
INLINE float32 packFloat32( flag zSign, int16 zExp, uint32_t zSig )
281
{
282

    
283
    return make_float32(
284
          ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
285

    
286
}
287

    
288
/*----------------------------------------------------------------------------
289
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
290
| and significand `zSig', and returns the proper single-precision floating-
291
| point value corresponding to the abstract input.  Ordinarily, the abstract
292
| value is simply rounded and packed into the single-precision format, with
293
| the inexact exception raised if the abstract input cannot be represented
294
| exactly.  However, if the abstract value is too large, the overflow and
295
| inexact exceptions are raised and an infinity or maximal finite value is
296
| returned.  If the abstract value is too small, the input value is rounded to
297
| a subnormal number, and the underflow and inexact exceptions are raised if
298
| the abstract input cannot be represented exactly as a subnormal single-
299
| precision floating-point number.
300
|     The input significand `zSig' has its binary point between bits 30
301
| and 29, which is 7 bits to the left of the usual location.  This shifted
302
| significand must be normalized or smaller.  If `zSig' is not normalized,
303
| `zExp' must be 0; in that case, the result returned is a subnormal number,
304
| and it must not require rounding.  In the usual case that `zSig' is
305
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
306
| The handling of underflow and overflow follows the IEC/IEEE Standard for
307
| Binary Floating-Point Arithmetic.
308
*----------------------------------------------------------------------------*/
309

    
310
static float32 roundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
311
{
312
    int8 roundingMode;
313
    flag roundNearestEven;
314
    int8 roundIncrement, roundBits;
315
    flag isTiny;
316

    
317
    roundingMode = STATUS(float_rounding_mode);
318
    roundNearestEven = ( roundingMode == float_round_nearest_even );
319
    roundIncrement = 0x40;
320
    if ( ! roundNearestEven ) {
321
        if ( roundingMode == float_round_to_zero ) {
322
            roundIncrement = 0;
323
        }
324
        else {
325
            roundIncrement = 0x7F;
326
            if ( zSign ) {
327
                if ( roundingMode == float_round_up ) roundIncrement = 0;
328
            }
329
            else {
330
                if ( roundingMode == float_round_down ) roundIncrement = 0;
331
            }
332
        }
333
    }
334
    roundBits = zSig & 0x7F;
335
    if ( 0xFD <= (uint16_t) zExp ) {
336
        if (    ( 0xFD < zExp )
337
             || (    ( zExp == 0xFD )
338
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
339
           ) {
340
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
341
            return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
342
        }
343
        if ( zExp < 0 ) {
344
            if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
345
            isTiny =
346
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
347
                || ( zExp < -1 )
348
                || ( zSig + roundIncrement < 0x80000000 );
349
            shift32RightJamming( zSig, - zExp, &zSig );
350
            zExp = 0;
351
            roundBits = zSig & 0x7F;
352
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
353
        }
354
    }
355
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
356
    zSig = ( zSig + roundIncrement )>>7;
357
    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
358
    if ( zSig == 0 ) zExp = 0;
359
    return packFloat32( zSign, zExp, zSig );
360

    
361
}
362

    
363
/*----------------------------------------------------------------------------
364
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
365
| and significand `zSig', and returns the proper single-precision floating-
366
| point value corresponding to the abstract input.  This routine is just like
367
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
368
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
369
| floating-point exponent.
370
*----------------------------------------------------------------------------*/
371

    
372
static float32
373
 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
374
{
375
    int8 shiftCount;
376

    
377
    shiftCount = countLeadingZeros32( zSig ) - 1;
378
    return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
379

    
380
}
381

    
382
/*----------------------------------------------------------------------------
383
| Returns the fraction bits of the double-precision floating-point value `a'.
384
*----------------------------------------------------------------------------*/
385

    
386
INLINE uint64_t extractFloat64Frac( float64 a )
387
{
388

    
389
    return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
390

    
391
}
392

    
393
/*----------------------------------------------------------------------------
394
| Returns the exponent bits of the double-precision floating-point value `a'.
395
*----------------------------------------------------------------------------*/
396

    
397
INLINE int16 extractFloat64Exp( float64 a )
398
{
399

    
400
    return ( float64_val(a)>>52 ) & 0x7FF;
401

    
402
}
403

    
404
/*----------------------------------------------------------------------------
405
| Returns the sign bit of the double-precision floating-point value `a'.
406
*----------------------------------------------------------------------------*/
407

    
408
INLINE flag extractFloat64Sign( float64 a )
409
{
410

    
411
    return float64_val(a)>>63;
412

    
413
}
414

    
415
/*----------------------------------------------------------------------------
416
| If `a' is denormal and we are in flush-to-zero mode then set the
417
| input-denormal exception and return zero. Otherwise just return the value.
418
*----------------------------------------------------------------------------*/
419
static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
420
{
421
    if (STATUS(flush_inputs_to_zero)) {
422
        if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
423
            float_raise(float_flag_input_denormal STATUS_VAR);
424
            return make_float64(float64_val(a) & (1ULL << 63));
425
        }
426
    }
427
    return a;
428
}
429

    
430
/*----------------------------------------------------------------------------
431
| Normalizes the subnormal double-precision floating-point value represented
432
| by the denormalized significand `aSig'.  The normalized exponent and
433
| significand are stored at the locations pointed to by `zExpPtr' and
434
| `zSigPtr', respectively.
435
*----------------------------------------------------------------------------*/
436

    
437
static void
438
 normalizeFloat64Subnormal( uint64_t aSig, int16 *zExpPtr, uint64_t *zSigPtr )
439
{
440
    int8 shiftCount;
441

    
442
    shiftCount = countLeadingZeros64( aSig ) - 11;
443
    *zSigPtr = aSig<<shiftCount;
444
    *zExpPtr = 1 - shiftCount;
445

    
446
}
447

    
448
/*----------------------------------------------------------------------------
449
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
450
| double-precision floating-point value, returning the result.  After being
451
| shifted into the proper positions, the three fields are simply added
452
| together to form the result.  This means that any integer portion of `zSig'
453
| will be added into the exponent.  Since a properly normalized significand
454
| will have an integer portion equal to 1, the `zExp' input should be 1 less
455
| than the desired result exponent whenever `zSig' is a complete, normalized
456
| significand.
457
*----------------------------------------------------------------------------*/
458

    
459
INLINE float64 packFloat64( flag zSign, int16 zExp, uint64_t zSig )
460
{
461

    
462
    return make_float64(
463
        ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
464

    
465
}
466

    
467
/*----------------------------------------------------------------------------
468
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
469
| and significand `zSig', and returns the proper double-precision floating-
470
| point value corresponding to the abstract input.  Ordinarily, the abstract
471
| value is simply rounded and packed into the double-precision format, with
472
| the inexact exception raised if the abstract input cannot be represented
473
| exactly.  However, if the abstract value is too large, the overflow and
474
| inexact exceptions are raised and an infinity or maximal finite value is
475
| returned.  If the abstract value is too small, the input value is rounded
476
| to a subnormal number, and the underflow and inexact exceptions are raised
477
| if the abstract input cannot be represented exactly as a subnormal double-
478
| precision floating-point number.
479
|     The input significand `zSig' has its binary point between bits 62
480
| and 61, which is 10 bits to the left of the usual location.  This shifted
481
| significand must be normalized or smaller.  If `zSig' is not normalized,
482
| `zExp' must be 0; in that case, the result returned is a subnormal number,
483
| and it must not require rounding.  In the usual case that `zSig' is
484
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
485
| The handling of underflow and overflow follows the IEC/IEEE Standard for
486
| Binary Floating-Point Arithmetic.
487
*----------------------------------------------------------------------------*/
488

    
489
static float64 roundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
490
{
491
    int8 roundingMode;
492
    flag roundNearestEven;
493
    int16 roundIncrement, roundBits;
494
    flag isTiny;
495

    
496
    roundingMode = STATUS(float_rounding_mode);
497
    roundNearestEven = ( roundingMode == float_round_nearest_even );
498
    roundIncrement = 0x200;
499
    if ( ! roundNearestEven ) {
500
        if ( roundingMode == float_round_to_zero ) {
501
            roundIncrement = 0;
502
        }
503
        else {
504
            roundIncrement = 0x3FF;
505
            if ( zSign ) {
506
                if ( roundingMode == float_round_up ) roundIncrement = 0;
507
            }
508
            else {
509
                if ( roundingMode == float_round_down ) roundIncrement = 0;
510
            }
511
        }
512
    }
513
    roundBits = zSig & 0x3FF;
514
    if ( 0x7FD <= (uint16_t) zExp ) {
515
        if (    ( 0x7FD < zExp )
516
             || (    ( zExp == 0x7FD )
517
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
518
           ) {
519
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
520
            return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
521
        }
522
        if ( zExp < 0 ) {
523
            if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
524
            isTiny =
525
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
526
                || ( zExp < -1 )
527
                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
528
            shift64RightJamming( zSig, - zExp, &zSig );
529
            zExp = 0;
530
            roundBits = zSig & 0x3FF;
531
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
532
        }
533
    }
534
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
535
    zSig = ( zSig + roundIncrement )>>10;
536
    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
537
    if ( zSig == 0 ) zExp = 0;
538
    return packFloat64( zSign, zExp, zSig );
539

    
540
}
541

    
542
/*----------------------------------------------------------------------------
543
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
544
| and significand `zSig', and returns the proper double-precision floating-
545
| point value corresponding to the abstract input.  This routine is just like
546
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
547
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
548
| floating-point exponent.
549
*----------------------------------------------------------------------------*/
550

    
551
static float64
552
 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
553
{
554
    int8 shiftCount;
555

    
556
    shiftCount = countLeadingZeros64( zSig ) - 1;
557
    return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
558

    
559
}
560

    
561
#ifdef FLOATX80
562

    
563
/*----------------------------------------------------------------------------
564
| Returns the fraction bits of the extended double-precision floating-point
565
| value `a'.
566
*----------------------------------------------------------------------------*/
567

    
568
INLINE uint64_t extractFloatx80Frac( floatx80 a )
569
{
570

    
571
    return a.low;
572

    
573
}
574

    
575
/*----------------------------------------------------------------------------
576
| Returns the exponent bits of the extended double-precision floating-point
577
| value `a'.
578
*----------------------------------------------------------------------------*/
579

    
580
INLINE int32 extractFloatx80Exp( floatx80 a )
581
{
582

    
583
    return a.high & 0x7FFF;
584

    
585
}
586

    
587
/*----------------------------------------------------------------------------
588
| Returns the sign bit of the extended double-precision floating-point value
589
| `a'.
590
*----------------------------------------------------------------------------*/
591

    
592
INLINE flag extractFloatx80Sign( floatx80 a )
593
{
594

    
595
    return a.high>>15;
596

    
597
}
598

    
599
/*----------------------------------------------------------------------------
600
| Normalizes the subnormal extended double-precision floating-point value
601
| represented by the denormalized significand `aSig'.  The normalized exponent
602
| and significand are stored at the locations pointed to by `zExpPtr' and
603
| `zSigPtr', respectively.
604
*----------------------------------------------------------------------------*/
605

    
606
static void
607
 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
608
{
609
    int8 shiftCount;
610

    
611
    shiftCount = countLeadingZeros64( aSig );
612
    *zSigPtr = aSig<<shiftCount;
613
    *zExpPtr = 1 - shiftCount;
614

    
615
}
616

    
617
/*----------------------------------------------------------------------------
618
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
619
| extended double-precision floating-point value, returning the result.
620
*----------------------------------------------------------------------------*/
621

    
622
INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
623
{
624
    floatx80 z;
625

    
626
    z.low = zSig;
627
    z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
628
    return z;
629

    
630
}
631

    
632
/*----------------------------------------------------------------------------
633
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
634
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
635
| and returns the proper extended double-precision floating-point value
636
| corresponding to the abstract input.  Ordinarily, the abstract value is
637
| rounded and packed into the extended double-precision format, with the
638
| inexact exception raised if the abstract input cannot be represented
639
| exactly.  However, if the abstract value is too large, the overflow and
640
| inexact exceptions are raised and an infinity or maximal finite value is
641
| returned.  If the abstract value is too small, the input value is rounded to
642
| a subnormal number, and the underflow and inexact exceptions are raised if
643
| the abstract input cannot be represented exactly as a subnormal extended
644
| double-precision floating-point number.
645
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
646
| number of bits as single or double precision, respectively.  Otherwise, the
647
| result is rounded to the full precision of the extended double-precision
648
| format.
649
|     The input significand must be normalized or smaller.  If the input
650
| significand is not normalized, `zExp' must be 0; in that case, the result
651
| returned is a subnormal number, and it must not require rounding.  The
652
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
653
| Floating-Point Arithmetic.
654
*----------------------------------------------------------------------------*/
655

    
656
static floatx80
657
 roundAndPackFloatx80(
658
     int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
659
 STATUS_PARAM)
660
{
661
    int8 roundingMode;
662
    flag roundNearestEven, increment, isTiny;
663
    int64 roundIncrement, roundMask, roundBits;
664

    
665
    roundingMode = STATUS(float_rounding_mode);
666
    roundNearestEven = ( roundingMode == float_round_nearest_even );
667
    if ( roundingPrecision == 80 ) goto precision80;
668
    if ( roundingPrecision == 64 ) {
669
        roundIncrement = LIT64( 0x0000000000000400 );
670
        roundMask = LIT64( 0x00000000000007FF );
671
    }
672
    else if ( roundingPrecision == 32 ) {
673
        roundIncrement = LIT64( 0x0000008000000000 );
674
        roundMask = LIT64( 0x000000FFFFFFFFFF );
675
    }
676
    else {
677
        goto precision80;
678
    }
679
    zSig0 |= ( zSig1 != 0 );
680
    if ( ! roundNearestEven ) {
681
        if ( roundingMode == float_round_to_zero ) {
682
            roundIncrement = 0;
683
        }
684
        else {
685
            roundIncrement = roundMask;
686
            if ( zSign ) {
687
                if ( roundingMode == float_round_up ) roundIncrement = 0;
688
            }
689
            else {
690
                if ( roundingMode == float_round_down ) roundIncrement = 0;
691
            }
692
        }
693
    }
694
    roundBits = zSig0 & roundMask;
695
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
696
        if (    ( 0x7FFE < zExp )
697
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
698
           ) {
699
            goto overflow;
700
        }
701
        if ( zExp <= 0 ) {
702
            if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
703
            isTiny =
704
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
705
                || ( zExp < 0 )
706
                || ( zSig0 <= zSig0 + roundIncrement );
707
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
708
            zExp = 0;
709
            roundBits = zSig0 & roundMask;
710
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
711
            if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
712
            zSig0 += roundIncrement;
713
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
714
            roundIncrement = roundMask + 1;
715
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
716
                roundMask |= roundIncrement;
717
            }
718
            zSig0 &= ~ roundMask;
719
            return packFloatx80( zSign, zExp, zSig0 );
720
        }
721
    }
722
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
723
    zSig0 += roundIncrement;
724
    if ( zSig0 < roundIncrement ) {
725
        ++zExp;
726
        zSig0 = LIT64( 0x8000000000000000 );
727
    }
728
    roundIncrement = roundMask + 1;
729
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
730
        roundMask |= roundIncrement;
731
    }
732
    zSig0 &= ~ roundMask;
733
    if ( zSig0 == 0 ) zExp = 0;
734
    return packFloatx80( zSign, zExp, zSig0 );
735
 precision80:
736
    increment = ( (int64_t) zSig1 < 0 );
737
    if ( ! roundNearestEven ) {
738
        if ( roundingMode == float_round_to_zero ) {
739
            increment = 0;
740
        }
741
        else {
742
            if ( zSign ) {
743
                increment = ( roundingMode == float_round_down ) && zSig1;
744
            }
745
            else {
746
                increment = ( roundingMode == float_round_up ) && zSig1;
747
            }
748
        }
749
    }
750
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
751
        if (    ( 0x7FFE < zExp )
752
             || (    ( zExp == 0x7FFE )
753
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
754
                  && increment
755
                )
756
           ) {
757
            roundMask = 0;
758
 overflow:
759
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
760
            if (    ( roundingMode == float_round_to_zero )
761
                 || ( zSign && ( roundingMode == float_round_up ) )
762
                 || ( ! zSign && ( roundingMode == float_round_down ) )
763
               ) {
764
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
765
            }
766
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
767
        }
768
        if ( zExp <= 0 ) {
769
            isTiny =
770
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
771
                || ( zExp < 0 )
772
                || ! increment
773
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
774
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
775
            zExp = 0;
776
            if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
777
            if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
778
            if ( roundNearestEven ) {
779
                increment = ( (int64_t) zSig1 < 0 );
780
            }
781
            else {
782
                if ( zSign ) {
783
                    increment = ( roundingMode == float_round_down ) && zSig1;
784
                }
785
                else {
786
                    increment = ( roundingMode == float_round_up ) && zSig1;
787
                }
788
            }
789
            if ( increment ) {
790
                ++zSig0;
791
                zSig0 &=
792
                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
793
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
794
            }
795
            return packFloatx80( zSign, zExp, zSig0 );
796
        }
797
    }
798
    if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
799
    if ( increment ) {
800
        ++zSig0;
801
        if ( zSig0 == 0 ) {
802
            ++zExp;
803
            zSig0 = LIT64( 0x8000000000000000 );
804
        }
805
        else {
806
            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
807
        }
808
    }
809
    else {
810
        if ( zSig0 == 0 ) zExp = 0;
811
    }
812
    return packFloatx80( zSign, zExp, zSig0 );
813

    
814
}
815

    
816
/*----------------------------------------------------------------------------
817
| Takes an abstract floating-point value having sign `zSign', exponent
818
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
819
| and returns the proper extended double-precision floating-point value
820
| corresponding to the abstract input.  This routine is just like
821
| `roundAndPackFloatx80' except that the input significand does not have to be
822
| normalized.
823
*----------------------------------------------------------------------------*/
824

    
825
static floatx80
826
 normalizeRoundAndPackFloatx80(
827
     int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
828
 STATUS_PARAM)
829
{
830
    int8 shiftCount;
831

    
832
    if ( zSig0 == 0 ) {
833
        zSig0 = zSig1;
834
        zSig1 = 0;
835
        zExp -= 64;
836
    }
837
    shiftCount = countLeadingZeros64( zSig0 );
838
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
839
    zExp -= shiftCount;
840
    return
841
        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
842

    
843
}
844

    
845
#endif
846

    
847
#ifdef FLOAT128
848

    
849
/*----------------------------------------------------------------------------
850
| Returns the least-significant 64 fraction bits of the quadruple-precision
851
| floating-point value `a'.
852
*----------------------------------------------------------------------------*/
853

    
854
INLINE uint64_t extractFloat128Frac1( float128 a )
855
{
856

    
857
    return a.low;
858

    
859
}
860

    
861
/*----------------------------------------------------------------------------
862
| Returns the most-significant 48 fraction bits of the quadruple-precision
863
| floating-point value `a'.
864
*----------------------------------------------------------------------------*/
865

    
866
INLINE uint64_t extractFloat128Frac0( float128 a )
867
{
868

    
869
    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
870

    
871
}
872

    
873
/*----------------------------------------------------------------------------
874
| Returns the exponent bits of the quadruple-precision floating-point value
875
| `a'.
876
*----------------------------------------------------------------------------*/
877

    
878
INLINE int32 extractFloat128Exp( float128 a )
879
{
880

    
881
    return ( a.high>>48 ) & 0x7FFF;
882

    
883
}
884

    
885
/*----------------------------------------------------------------------------
886
| Returns the sign bit of the quadruple-precision floating-point value `a'.
887
*----------------------------------------------------------------------------*/
888

    
889
INLINE flag extractFloat128Sign( float128 a )
890
{
891

    
892
    return a.high>>63;
893

    
894
}
895

    
896
/*----------------------------------------------------------------------------
897
| Normalizes the subnormal quadruple-precision floating-point value
898
| represented by the denormalized significand formed by the concatenation of
899
| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
900
| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
901
| significand are stored at the location pointed to by `zSig0Ptr', and the
902
| least significant 64 bits of the normalized significand are stored at the
903
| location pointed to by `zSig1Ptr'.
904
*----------------------------------------------------------------------------*/
905

    
906
static void
907
 normalizeFloat128Subnormal(
908
     uint64_t aSig0,
909
     uint64_t aSig1,
910
     int32 *zExpPtr,
911
     uint64_t *zSig0Ptr,
912
     uint64_t *zSig1Ptr
913
 )
914
{
915
    int8 shiftCount;
916

    
917
    if ( aSig0 == 0 ) {
918
        shiftCount = countLeadingZeros64( aSig1 ) - 15;
919
        if ( shiftCount < 0 ) {
920
            *zSig0Ptr = aSig1>>( - shiftCount );
921
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
922
        }
923
        else {
924
            *zSig0Ptr = aSig1<<shiftCount;
925
            *zSig1Ptr = 0;
926
        }
927
        *zExpPtr = - shiftCount - 63;
928
    }
929
    else {
930
        shiftCount = countLeadingZeros64( aSig0 ) - 15;
931
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
932
        *zExpPtr = 1 - shiftCount;
933
    }
934

    
935
}
936

    
937
/*----------------------------------------------------------------------------
938
| Packs the sign `zSign', the exponent `zExp', and the significand formed
939
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
940
| floating-point value, returning the result.  After being shifted into the
941
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
942
| added together to form the most significant 32 bits of the result.  This
943
| means that any integer portion of `zSig0' will be added into the exponent.
944
| Since a properly normalized significand will have an integer portion equal
945
| to 1, the `zExp' input should be 1 less than the desired result exponent
946
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
947
| significand.
948
*----------------------------------------------------------------------------*/
949

    
950
INLINE float128
951
 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
952
{
953
    float128 z;
954

    
955
    z.low = zSig1;
956
    z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
957
    return z;
958

    
959
}
960

    
961
/*----------------------------------------------------------------------------
962
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
963
| and extended significand formed by the concatenation of `zSig0', `zSig1',
964
| and `zSig2', and returns the proper quadruple-precision floating-point value
965
| corresponding to the abstract input.  Ordinarily, the abstract value is
966
| simply rounded and packed into the quadruple-precision format, with the
967
| inexact exception raised if the abstract input cannot be represented
968
| exactly.  However, if the abstract value is too large, the overflow and
969
| inexact exceptions are raised and an infinity or maximal finite value is
970
| returned.  If the abstract value is too small, the input value is rounded to
971
| a subnormal number, and the underflow and inexact exceptions are raised if
972
| the abstract input cannot be represented exactly as a subnormal quadruple-
973
| precision floating-point number.
974
|     The input significand must be normalized or smaller.  If the input
975
| significand is not normalized, `zExp' must be 0; in that case, the result
976
| returned is a subnormal number, and it must not require rounding.  In the
977
| usual case that the input significand is normalized, `zExp' must be 1 less
978
| than the ``true'' floating-point exponent.  The handling of underflow and
979
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
980
*----------------------------------------------------------------------------*/
981

    
982
static float128
983
 roundAndPackFloat128(
984
     flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
985
{
986
    int8 roundingMode;
987
    flag roundNearestEven, increment, isTiny;
988

    
989
    roundingMode = STATUS(float_rounding_mode);
990
    roundNearestEven = ( roundingMode == float_round_nearest_even );
991
    increment = ( (int64_t) zSig2 < 0 );
992
    if ( ! roundNearestEven ) {
993
        if ( roundingMode == float_round_to_zero ) {
994
            increment = 0;
995
        }
996
        else {
997
            if ( zSign ) {
998
                increment = ( roundingMode == float_round_down ) && zSig2;
999
            }
1000
            else {
1001
                increment = ( roundingMode == float_round_up ) && zSig2;
1002
            }
1003
        }
1004
    }
1005
    if ( 0x7FFD <= (uint32_t) zExp ) {
1006
        if (    ( 0x7FFD < zExp )
1007
             || (    ( zExp == 0x7FFD )
1008
                  && eq128(
1009
                         LIT64( 0x0001FFFFFFFFFFFF ),
1010
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
1011
                         zSig0,
1012
                         zSig1
1013
                     )
1014
                  && increment
1015
                )
1016
           ) {
1017
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1018
            if (    ( roundingMode == float_round_to_zero )
1019
                 || ( zSign && ( roundingMode == float_round_up ) )
1020
                 || ( ! zSign && ( roundingMode == float_round_down ) )
1021
               ) {
1022
                return
1023
                    packFloat128(
1024
                        zSign,
1025
                        0x7FFE,
1026
                        LIT64( 0x0000FFFFFFFFFFFF ),
1027
                        LIT64( 0xFFFFFFFFFFFFFFFF )
1028
                    );
1029
            }
1030
            return packFloat128( zSign, 0x7FFF, 0, 0 );
1031
        }
1032
        if ( zExp < 0 ) {
1033
            if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
1034
            isTiny =
1035
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1036
                || ( zExp < -1 )
1037
                || ! increment
1038
                || lt128(
1039
                       zSig0,
1040
                       zSig1,
1041
                       LIT64( 0x0001FFFFFFFFFFFF ),
1042
                       LIT64( 0xFFFFFFFFFFFFFFFF )
1043
                   );
1044
            shift128ExtraRightJamming(
1045
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1046
            zExp = 0;
1047
            if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1048
            if ( roundNearestEven ) {
1049
                increment = ( (int64_t) zSig2 < 0 );
1050
            }
1051
            else {
1052
                if ( zSign ) {
1053
                    increment = ( roundingMode == float_round_down ) && zSig2;
1054
                }
1055
                else {
1056
                    increment = ( roundingMode == float_round_up ) && zSig2;
1057
                }
1058
            }
1059
        }
1060
    }
1061
    if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1062
    if ( increment ) {
1063
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1064
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1065
    }
1066
    else {
1067
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1068
    }
1069
    return packFloat128( zSign, zExp, zSig0, zSig1 );
1070

    
1071
}
1072

    
1073
/*----------------------------------------------------------------------------
1074
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1075
| and significand formed by the concatenation of `zSig0' and `zSig1', and
1076
| returns the proper quadruple-precision floating-point value corresponding
1077
| to the abstract input.  This routine is just like `roundAndPackFloat128'
1078
| except that the input significand has fewer bits and does not have to be
1079
| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1080
| point exponent.
1081
*----------------------------------------------------------------------------*/
1082

    
1083
static float128
1084
 normalizeRoundAndPackFloat128(
1085
     flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1086
{
1087
    int8 shiftCount;
1088
    uint64_t zSig2;
1089

    
1090
    if ( zSig0 == 0 ) {
1091
        zSig0 = zSig1;
1092
        zSig1 = 0;
1093
        zExp -= 64;
1094
    }
1095
    shiftCount = countLeadingZeros64( zSig0 ) - 15;
1096
    if ( 0 <= shiftCount ) {
1097
        zSig2 = 0;
1098
        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1099
    }
1100
    else {
1101
        shift128ExtraRightJamming(
1102
            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1103
    }
1104
    zExp -= shiftCount;
1105
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1106

    
1107
}
1108

    
1109
#endif
1110

    
1111
/*----------------------------------------------------------------------------
1112
| Returns the result of converting the 32-bit two's complement integer `a'
1113
| to the single-precision floating-point format.  The conversion is performed
1114
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1115
*----------------------------------------------------------------------------*/
1116

    
1117
float32 int32_to_float32( int32 a STATUS_PARAM )
1118
{
1119
    flag zSign;
1120

    
1121
    if ( a == 0 ) return float32_zero;
1122
    if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1123
    zSign = ( a < 0 );
1124
    return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1125

    
1126
}
1127

    
1128
/*----------------------------------------------------------------------------
1129
| Returns the result of converting the 32-bit two's complement integer `a'
1130
| to the double-precision floating-point format.  The conversion is performed
1131
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1132
*----------------------------------------------------------------------------*/
1133

    
1134
float64 int32_to_float64( int32 a STATUS_PARAM )
1135
{
1136
    flag zSign;
1137
    uint32 absA;
1138
    int8 shiftCount;
1139
    uint64_t zSig;
1140

    
1141
    if ( a == 0 ) return float64_zero;
1142
    zSign = ( a < 0 );
1143
    absA = zSign ? - a : a;
1144
    shiftCount = countLeadingZeros32( absA ) + 21;
1145
    zSig = absA;
1146
    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1147

    
1148
}
1149

    
1150
#ifdef FLOATX80
1151

    
1152
/*----------------------------------------------------------------------------
1153
| Returns the result of converting the 32-bit two's complement integer `a'
1154
| to the extended double-precision floating-point format.  The conversion
1155
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1156
| Arithmetic.
1157
*----------------------------------------------------------------------------*/
1158

    
1159
floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1160
{
1161
    flag zSign;
1162
    uint32 absA;
1163
    int8 shiftCount;
1164
    uint64_t zSig;
1165

    
1166
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1167
    zSign = ( a < 0 );
1168
    absA = zSign ? - a : a;
1169
    shiftCount = countLeadingZeros32( absA ) + 32;
1170
    zSig = absA;
1171
    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1172

    
1173
}
1174

    
1175
#endif
1176

    
1177
#ifdef FLOAT128
1178

    
1179
/*----------------------------------------------------------------------------
1180
| Returns the result of converting the 32-bit two's complement integer `a' to
1181
| the quadruple-precision floating-point format.  The conversion is performed
1182
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1183
*----------------------------------------------------------------------------*/
1184

    
1185
float128 int32_to_float128( int32 a STATUS_PARAM )
1186
{
1187
    flag zSign;
1188
    uint32 absA;
1189
    int8 shiftCount;
1190
    uint64_t zSig0;
1191

    
1192
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1193
    zSign = ( a < 0 );
1194
    absA = zSign ? - a : a;
1195
    shiftCount = countLeadingZeros32( absA ) + 17;
1196
    zSig0 = absA;
1197
    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1198

    
1199
}
1200

    
1201
#endif
1202

    
1203
/*----------------------------------------------------------------------------
1204
| Returns the result of converting the 64-bit two's complement integer `a'
1205
| to the single-precision floating-point format.  The conversion is performed
1206
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1207
*----------------------------------------------------------------------------*/
1208

    
1209
float32 int64_to_float32( int64 a STATUS_PARAM )
1210
{
1211
    flag zSign;
1212
    uint64 absA;
1213
    int8 shiftCount;
1214

    
1215
    if ( a == 0 ) return float32_zero;
1216
    zSign = ( a < 0 );
1217
    absA = zSign ? - a : a;
1218
    shiftCount = countLeadingZeros64( absA ) - 40;
1219
    if ( 0 <= shiftCount ) {
1220
        return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1221
    }
1222
    else {
1223
        shiftCount += 7;
1224
        if ( shiftCount < 0 ) {
1225
            shift64RightJamming( absA, - shiftCount, &absA );
1226
        }
1227
        else {
1228
            absA <<= shiftCount;
1229
        }
1230
        return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1231
    }
1232

    
1233
}
1234

    
1235
float32 uint64_to_float32( uint64 a STATUS_PARAM )
1236
{
1237
    int8 shiftCount;
1238

    
1239
    if ( a == 0 ) return float32_zero;
1240
    shiftCount = countLeadingZeros64( a ) - 40;
1241
    if ( 0 <= shiftCount ) {
1242
        return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1243
    }
1244
    else {
1245
        shiftCount += 7;
1246
        if ( shiftCount < 0 ) {
1247
            shift64RightJamming( a, - shiftCount, &a );
1248
        }
1249
        else {
1250
            a <<= shiftCount;
1251
        }
1252
        return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1253
    }
1254
}
1255

    
1256
/*----------------------------------------------------------------------------
1257
| Returns the result of converting the 64-bit two's complement integer `a'
1258
| to the double-precision floating-point format.  The conversion is performed
1259
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1260
*----------------------------------------------------------------------------*/
1261

    
1262
float64 int64_to_float64( int64 a STATUS_PARAM )
1263
{
1264
    flag zSign;
1265

    
1266
    if ( a == 0 ) return float64_zero;
1267
    if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1268
        return packFloat64( 1, 0x43E, 0 );
1269
    }
1270
    zSign = ( a < 0 );
1271
    return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1272

    
1273
}
1274

    
1275
float64 uint64_to_float64( uint64 a STATUS_PARAM )
1276
{
1277
    if ( a == 0 ) return float64_zero;
1278
    return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1279

    
1280
}
1281

    
1282
#ifdef FLOATX80
1283

    
1284
/*----------------------------------------------------------------------------
1285
| Returns the result of converting the 64-bit two's complement integer `a'
1286
| to the extended double-precision floating-point format.  The conversion
1287
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1288
| Arithmetic.
1289
*----------------------------------------------------------------------------*/
1290

    
1291
floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1292
{
1293
    flag zSign;
1294
    uint64 absA;
1295
    int8 shiftCount;
1296

    
1297
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1298
    zSign = ( a < 0 );
1299
    absA = zSign ? - a : a;
1300
    shiftCount = countLeadingZeros64( absA );
1301
    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1302

    
1303
}
1304

    
1305
#endif
1306

    
1307
#ifdef FLOAT128
1308

    
1309
/*----------------------------------------------------------------------------
1310
| Returns the result of converting the 64-bit two's complement integer `a' to
1311
| the quadruple-precision floating-point format.  The conversion is performed
1312
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1313
*----------------------------------------------------------------------------*/
1314

    
1315
float128 int64_to_float128( int64 a STATUS_PARAM )
1316
{
1317
    flag zSign;
1318
    uint64 absA;
1319
    int8 shiftCount;
1320
    int32 zExp;
1321
    uint64_t zSig0, zSig1;
1322

    
1323
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1324
    zSign = ( a < 0 );
1325
    absA = zSign ? - a : a;
1326
    shiftCount = countLeadingZeros64( absA ) + 49;
1327
    zExp = 0x406E - shiftCount;
1328
    if ( 64 <= shiftCount ) {
1329
        zSig1 = 0;
1330
        zSig0 = absA;
1331
        shiftCount -= 64;
1332
    }
1333
    else {
1334
        zSig1 = absA;
1335
        zSig0 = 0;
1336
    }
1337
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1338
    return packFloat128( zSign, zExp, zSig0, zSig1 );
1339

    
1340
}
1341

    
1342
#endif
1343

    
1344
/*----------------------------------------------------------------------------
1345
| Returns the result of converting the single-precision floating-point value
1346
| `a' to the 32-bit two's complement integer format.  The conversion is
1347
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1348
| Arithmetic---which means in particular that the conversion is rounded
1349
| according to the current rounding mode.  If `a' is a NaN, the largest
1350
| positive integer is returned.  Otherwise, if the conversion overflows, the
1351
| largest integer with the same sign as `a' is returned.
1352
*----------------------------------------------------------------------------*/
1353

    
1354
int32 float32_to_int32( float32 a STATUS_PARAM )
1355
{
1356
    flag aSign;
1357
    int16 aExp, shiftCount;
1358
    uint32_t aSig;
1359
    uint64_t aSig64;
1360

    
1361
    a = float32_squash_input_denormal(a STATUS_VAR);
1362
    aSig = extractFloat32Frac( a );
1363
    aExp = extractFloat32Exp( a );
1364
    aSign = extractFloat32Sign( a );
1365
    if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1366
    if ( aExp ) aSig |= 0x00800000;
1367
    shiftCount = 0xAF - aExp;
1368
    aSig64 = aSig;
1369
    aSig64 <<= 32;
1370
    if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1371
    return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1372

    
1373
}
1374

    
1375
/*----------------------------------------------------------------------------
1376
| Returns the result of converting the single-precision floating-point value
1377
| `a' to the 32-bit two's complement integer format.  The conversion is
1378
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1379
| Arithmetic, except that the conversion is always rounded toward zero.
1380
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1381
| the conversion overflows, the largest integer with the same sign as `a' is
1382
| returned.
1383
*----------------------------------------------------------------------------*/
1384

    
1385
int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1386
{
1387
    flag aSign;
1388
    int16 aExp, shiftCount;
1389
    uint32_t aSig;
1390
    int32 z;
1391
    a = float32_squash_input_denormal(a STATUS_VAR);
1392

    
1393
    aSig = extractFloat32Frac( a );
1394
    aExp = extractFloat32Exp( a );
1395
    aSign = extractFloat32Sign( a );
1396
    shiftCount = aExp - 0x9E;
1397
    if ( 0 <= shiftCount ) {
1398
        if ( float32_val(a) != 0xCF000000 ) {
1399
            float_raise( float_flag_invalid STATUS_VAR);
1400
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1401
        }
1402
        return (int32_t) 0x80000000;
1403
    }
1404
    else if ( aExp <= 0x7E ) {
1405
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1406
        return 0;
1407
    }
1408
    aSig = ( aSig | 0x00800000 )<<8;
1409
    z = aSig>>( - shiftCount );
1410
    if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1411
        STATUS(float_exception_flags) |= float_flag_inexact;
1412
    }
1413
    if ( aSign ) z = - z;
1414
    return z;
1415

    
1416
}
1417

    
1418
/*----------------------------------------------------------------------------
1419
| Returns the result of converting the single-precision floating-point value
1420
| `a' to the 16-bit two's complement integer format.  The conversion is
1421
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1422
| Arithmetic, except that the conversion is always rounded toward zero.
1423
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1424
| the conversion overflows, the largest integer with the same sign as `a' is
1425
| returned.
1426
*----------------------------------------------------------------------------*/
1427

    
1428
int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1429
{
1430
    flag aSign;
1431
    int16 aExp, shiftCount;
1432
    uint32_t aSig;
1433
    int32 z;
1434

    
1435
    aSig = extractFloat32Frac( a );
1436
    aExp = extractFloat32Exp( a );
1437
    aSign = extractFloat32Sign( a );
1438
    shiftCount = aExp - 0x8E;
1439
    if ( 0 <= shiftCount ) {
1440
        if ( float32_val(a) != 0xC7000000 ) {
1441
            float_raise( float_flag_invalid STATUS_VAR);
1442
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1443
                return 0x7FFF;
1444
            }
1445
        }
1446
        return (int32_t) 0xffff8000;
1447
    }
1448
    else if ( aExp <= 0x7E ) {
1449
        if ( aExp | aSig ) {
1450
            STATUS(float_exception_flags) |= float_flag_inexact;
1451
        }
1452
        return 0;
1453
    }
1454
    shiftCount -= 0x10;
1455
    aSig = ( aSig | 0x00800000 )<<8;
1456
    z = aSig>>( - shiftCount );
1457
    if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1458
        STATUS(float_exception_flags) |= float_flag_inexact;
1459
    }
1460
    if ( aSign ) {
1461
        z = - z;
1462
    }
1463
    return z;
1464

    
1465
}
1466

    
1467
/*----------------------------------------------------------------------------
1468
| Returns the result of converting the single-precision floating-point value
1469
| `a' to the 64-bit two's complement integer format.  The conversion is
1470
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1471
| Arithmetic---which means in particular that the conversion is rounded
1472
| according to the current rounding mode.  If `a' is a NaN, the largest
1473
| positive integer is returned.  Otherwise, if the conversion overflows, the
1474
| largest integer with the same sign as `a' is returned.
1475
*----------------------------------------------------------------------------*/
1476

    
1477
int64 float32_to_int64( float32 a STATUS_PARAM )
1478
{
1479
    flag aSign;
1480
    int16 aExp, shiftCount;
1481
    uint32_t aSig;
1482
    uint64_t aSig64, aSigExtra;
1483
    a = float32_squash_input_denormal(a STATUS_VAR);
1484

    
1485
    aSig = extractFloat32Frac( a );
1486
    aExp = extractFloat32Exp( a );
1487
    aSign = extractFloat32Sign( a );
1488
    shiftCount = 0xBE - aExp;
1489
    if ( shiftCount < 0 ) {
1490
        float_raise( float_flag_invalid STATUS_VAR);
1491
        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1492
            return LIT64( 0x7FFFFFFFFFFFFFFF );
1493
        }
1494
        return (int64_t) LIT64( 0x8000000000000000 );
1495
    }
1496
    if ( aExp ) aSig |= 0x00800000;
1497
    aSig64 = aSig;
1498
    aSig64 <<= 40;
1499
    shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1500
    return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1501

    
1502
}
1503

    
1504
/*----------------------------------------------------------------------------
1505
| Returns the result of converting the single-precision floating-point value
1506
| `a' to the 64-bit two's complement integer format.  The conversion is
1507
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1508
| Arithmetic, except that the conversion is always rounded toward zero.  If
1509
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1510
| conversion overflows, the largest integer with the same sign as `a' is
1511
| returned.
1512
*----------------------------------------------------------------------------*/
1513

    
1514
int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1515
{
1516
    flag aSign;
1517
    int16 aExp, shiftCount;
1518
    uint32_t aSig;
1519
    uint64_t aSig64;
1520
    int64 z;
1521
    a = float32_squash_input_denormal(a STATUS_VAR);
1522

    
1523
    aSig = extractFloat32Frac( a );
1524
    aExp = extractFloat32Exp( a );
1525
    aSign = extractFloat32Sign( a );
1526
    shiftCount = aExp - 0xBE;
1527
    if ( 0 <= shiftCount ) {
1528
        if ( float32_val(a) != 0xDF000000 ) {
1529
            float_raise( float_flag_invalid STATUS_VAR);
1530
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1531
                return LIT64( 0x7FFFFFFFFFFFFFFF );
1532
            }
1533
        }
1534
        return (int64_t) LIT64( 0x8000000000000000 );
1535
    }
1536
    else if ( aExp <= 0x7E ) {
1537
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1538
        return 0;
1539
    }
1540
    aSig64 = aSig | 0x00800000;
1541
    aSig64 <<= 40;
1542
    z = aSig64>>( - shiftCount );
1543
    if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1544
        STATUS(float_exception_flags) |= float_flag_inexact;
1545
    }
1546
    if ( aSign ) z = - z;
1547
    return z;
1548

    
1549
}
1550

    
1551
/*----------------------------------------------------------------------------
1552
| Returns the result of converting the single-precision floating-point value
1553
| `a' to the double-precision floating-point format.  The conversion is
1554
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1555
| Arithmetic.
1556
*----------------------------------------------------------------------------*/
1557

    
1558
float64 float32_to_float64( float32 a STATUS_PARAM )
1559
{
1560
    flag aSign;
1561
    int16 aExp;
1562
    uint32_t aSig;
1563
    a = float32_squash_input_denormal(a STATUS_VAR);
1564

    
1565
    aSig = extractFloat32Frac( a );
1566
    aExp = extractFloat32Exp( a );
1567
    aSign = extractFloat32Sign( a );
1568
    if ( aExp == 0xFF ) {
1569
        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1570
        return packFloat64( aSign, 0x7FF, 0 );
1571
    }
1572
    if ( aExp == 0 ) {
1573
        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1574
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1575
        --aExp;
1576
    }
1577
    return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1578

    
1579
}
1580

    
1581
#ifdef FLOATX80
1582

    
1583
/*----------------------------------------------------------------------------
1584
| Returns the result of converting the single-precision floating-point value
1585
| `a' to the extended double-precision floating-point format.  The conversion
1586
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1587
| Arithmetic.
1588
*----------------------------------------------------------------------------*/
1589

    
1590
floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1591
{
1592
    flag aSign;
1593
    int16 aExp;
1594
    uint32_t aSig;
1595

    
1596
    a = float32_squash_input_denormal(a STATUS_VAR);
1597
    aSig = extractFloat32Frac( a );
1598
    aExp = extractFloat32Exp( a );
1599
    aSign = extractFloat32Sign( a );
1600
    if ( aExp == 0xFF ) {
1601
        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1602
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1603
    }
1604
    if ( aExp == 0 ) {
1605
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1606
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1607
    }
1608
    aSig |= 0x00800000;
1609
    return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1610

    
1611
}
1612

    
1613
#endif
1614

    
1615
#ifdef FLOAT128
1616

    
1617
/*----------------------------------------------------------------------------
1618
| Returns the result of converting the single-precision floating-point value
1619
| `a' to the double-precision floating-point format.  The conversion is
1620
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1621
| Arithmetic.
1622
*----------------------------------------------------------------------------*/
1623

    
1624
float128 float32_to_float128( float32 a STATUS_PARAM )
1625
{
1626
    flag aSign;
1627
    int16 aExp;
1628
    uint32_t aSig;
1629

    
1630
    a = float32_squash_input_denormal(a STATUS_VAR);
1631
    aSig = extractFloat32Frac( a );
1632
    aExp = extractFloat32Exp( a );
1633
    aSign = extractFloat32Sign( a );
1634
    if ( aExp == 0xFF ) {
1635
        if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1636
        return packFloat128( aSign, 0x7FFF, 0, 0 );
1637
    }
1638
    if ( aExp == 0 ) {
1639
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1640
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1641
        --aExp;
1642
    }
1643
    return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1644

    
1645
}
1646

    
1647
#endif
1648

    
1649
/*----------------------------------------------------------------------------
1650
| Rounds the single-precision floating-point value `a' to an integer, and
1651
| returns the result as a single-precision floating-point value.  The
1652
| operation is performed according to the IEC/IEEE Standard for Binary
1653
| Floating-Point Arithmetic.
1654
*----------------------------------------------------------------------------*/
1655

    
1656
float32 float32_round_to_int( float32 a STATUS_PARAM)
1657
{
1658
    flag aSign;
1659
    int16 aExp;
1660
    uint32_t lastBitMask, roundBitsMask;
1661
    int8 roundingMode;
1662
    uint32_t z;
1663
    a = float32_squash_input_denormal(a STATUS_VAR);
1664

    
1665
    aExp = extractFloat32Exp( a );
1666
    if ( 0x96 <= aExp ) {
1667
        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1668
            return propagateFloat32NaN( a, a STATUS_VAR );
1669
        }
1670
        return a;
1671
    }
1672
    if ( aExp <= 0x7E ) {
1673
        if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1674
        STATUS(float_exception_flags) |= float_flag_inexact;
1675
        aSign = extractFloat32Sign( a );
1676
        switch ( STATUS(float_rounding_mode) ) {
1677
         case float_round_nearest_even:
1678
            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1679
                return packFloat32( aSign, 0x7F, 0 );
1680
            }
1681
            break;
1682
         case float_round_down:
1683
            return make_float32(aSign ? 0xBF800000 : 0);
1684
         case float_round_up:
1685
            return make_float32(aSign ? 0x80000000 : 0x3F800000);
1686
        }
1687
        return packFloat32( aSign, 0, 0 );
1688
    }
1689
    lastBitMask = 1;
1690
    lastBitMask <<= 0x96 - aExp;
1691
    roundBitsMask = lastBitMask - 1;
1692
    z = float32_val(a);
1693
    roundingMode = STATUS(float_rounding_mode);
1694
    if ( roundingMode == float_round_nearest_even ) {
1695
        z += lastBitMask>>1;
1696
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1697
    }
1698
    else if ( roundingMode != float_round_to_zero ) {
1699
        if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1700
            z += roundBitsMask;
1701
        }
1702
    }
1703
    z &= ~ roundBitsMask;
1704
    if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1705
    return make_float32(z);
1706

    
1707
}
1708

    
1709
/*----------------------------------------------------------------------------
1710
| Returns the result of adding the absolute values of the single-precision
1711
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1712
| before being returned.  `zSign' is ignored if the result is a NaN.
1713
| The addition is performed according to the IEC/IEEE Standard for Binary
1714
| Floating-Point Arithmetic.
1715
*----------------------------------------------------------------------------*/
1716

    
1717
static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1718
{
1719
    int16 aExp, bExp, zExp;
1720
    uint32_t aSig, bSig, zSig;
1721
    int16 expDiff;
1722

    
1723
    aSig = extractFloat32Frac( a );
1724
    aExp = extractFloat32Exp( a );
1725
    bSig = extractFloat32Frac( b );
1726
    bExp = extractFloat32Exp( b );
1727
    expDiff = aExp - bExp;
1728
    aSig <<= 6;
1729
    bSig <<= 6;
1730
    if ( 0 < expDiff ) {
1731
        if ( aExp == 0xFF ) {
1732
            if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1733
            return a;
1734
        }
1735
        if ( bExp == 0 ) {
1736
            --expDiff;
1737
        }
1738
        else {
1739
            bSig |= 0x20000000;
1740
        }
1741
        shift32RightJamming( bSig, expDiff, &bSig );
1742
        zExp = aExp;
1743
    }
1744
    else if ( expDiff < 0 ) {
1745
        if ( bExp == 0xFF ) {
1746
            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1747
            return packFloat32( zSign, 0xFF, 0 );
1748
        }
1749
        if ( aExp == 0 ) {
1750
            ++expDiff;
1751
        }
1752
        else {
1753
            aSig |= 0x20000000;
1754
        }
1755
        shift32RightJamming( aSig, - expDiff, &aSig );
1756
        zExp = bExp;
1757
    }
1758
    else {
1759
        if ( aExp == 0xFF ) {
1760
            if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1761
            return a;
1762
        }
1763
        if ( aExp == 0 ) {
1764
            if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1765
            return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1766
        }
1767
        zSig = 0x40000000 + aSig + bSig;
1768
        zExp = aExp;
1769
        goto roundAndPack;
1770
    }
1771
    aSig |= 0x20000000;
1772
    zSig = ( aSig + bSig )<<1;
1773
    --zExp;
1774
    if ( (int32_t) zSig < 0 ) {
1775
        zSig = aSig + bSig;
1776
        ++zExp;
1777
    }
1778
 roundAndPack:
1779
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1780

    
1781
}
1782

    
1783
/*----------------------------------------------------------------------------
1784
| Returns the result of subtracting the absolute values of the single-
1785
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
1786
| difference is negated before being returned.  `zSign' is ignored if the
1787
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
1788
| Standard for Binary Floating-Point Arithmetic.
1789
*----------------------------------------------------------------------------*/
1790

    
1791
static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1792
{
1793
    int16 aExp, bExp, zExp;
1794
    uint32_t aSig, bSig, zSig;
1795
    int16 expDiff;
1796

    
1797
    aSig = extractFloat32Frac( a );
1798
    aExp = extractFloat32Exp( a );
1799
    bSig = extractFloat32Frac( b );
1800
    bExp = extractFloat32Exp( b );
1801
    expDiff = aExp - bExp;
1802
    aSig <<= 7;
1803
    bSig <<= 7;
1804
    if ( 0 < expDiff ) goto aExpBigger;
1805
    if ( expDiff < 0 ) goto bExpBigger;
1806
    if ( aExp == 0xFF ) {
1807
        if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1808
        float_raise( float_flag_invalid STATUS_VAR);
1809
        return float32_default_nan;
1810
    }
1811
    if ( aExp == 0 ) {
1812
        aExp = 1;
1813
        bExp = 1;
1814
    }
1815
    if ( bSig < aSig ) goto aBigger;
1816
    if ( aSig < bSig ) goto bBigger;
1817
    return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1818
 bExpBigger:
1819
    if ( bExp == 0xFF ) {
1820
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1821
        return packFloat32( zSign ^ 1, 0xFF, 0 );
1822
    }
1823
    if ( aExp == 0 ) {
1824
        ++expDiff;
1825
    }
1826
    else {
1827
        aSig |= 0x40000000;
1828
    }
1829
    shift32RightJamming( aSig, - expDiff, &aSig );
1830
    bSig |= 0x40000000;
1831
 bBigger:
1832
    zSig = bSig - aSig;
1833
    zExp = bExp;
1834
    zSign ^= 1;
1835
    goto normalizeRoundAndPack;
1836
 aExpBigger:
1837
    if ( aExp == 0xFF ) {
1838
        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1839
        return a;
1840
    }
1841
    if ( bExp == 0 ) {
1842
        --expDiff;
1843
    }
1844
    else {
1845
        bSig |= 0x40000000;
1846
    }
1847
    shift32RightJamming( bSig, expDiff, &bSig );
1848
    aSig |= 0x40000000;
1849
 aBigger:
1850
    zSig = aSig - bSig;
1851
    zExp = aExp;
1852
 normalizeRoundAndPack:
1853
    --zExp;
1854
    return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1855

    
1856
}
1857

    
1858
/*----------------------------------------------------------------------------
1859
| Returns the result of adding the single-precision floating-point values `a'
1860
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
1861
| Binary Floating-Point Arithmetic.
1862
*----------------------------------------------------------------------------*/
1863

    
1864
float32 float32_add( float32 a, float32 b STATUS_PARAM )
1865
{
1866
    flag aSign, bSign;
1867
    a = float32_squash_input_denormal(a STATUS_VAR);
1868
    b = float32_squash_input_denormal(b STATUS_VAR);
1869

    
1870
    aSign = extractFloat32Sign( a );
1871
    bSign = extractFloat32Sign( b );
1872
    if ( aSign == bSign ) {
1873
        return addFloat32Sigs( a, b, aSign STATUS_VAR);
1874
    }
1875
    else {
1876
        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1877
    }
1878

    
1879
}
1880

    
1881
/*----------------------------------------------------------------------------
1882
| Returns the result of subtracting the single-precision floating-point values
1883
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1884
| for Binary Floating-Point Arithmetic.
1885
*----------------------------------------------------------------------------*/
1886

    
1887
float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1888
{
1889
    flag aSign, bSign;
1890
    a = float32_squash_input_denormal(a STATUS_VAR);
1891
    b = float32_squash_input_denormal(b STATUS_VAR);
1892

    
1893
    aSign = extractFloat32Sign( a );
1894
    bSign = extractFloat32Sign( b );
1895
    if ( aSign == bSign ) {
1896
        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1897
    }
1898
    else {
1899
        return addFloat32Sigs( a, b, aSign STATUS_VAR );
1900
    }
1901

    
1902
}
1903

    
1904
/*----------------------------------------------------------------------------
1905
| Returns the result of multiplying the single-precision floating-point values
1906
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1907
| for Binary Floating-Point Arithmetic.
1908
*----------------------------------------------------------------------------*/
1909

    
1910
float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1911
{
1912
    flag aSign, bSign, zSign;
1913
    int16 aExp, bExp, zExp;
1914
    uint32_t aSig, bSig;
1915
    uint64_t zSig64;
1916
    uint32_t zSig;
1917

    
1918
    a = float32_squash_input_denormal(a STATUS_VAR);
1919
    b = float32_squash_input_denormal(b STATUS_VAR);
1920

    
1921
    aSig = extractFloat32Frac( a );
1922
    aExp = extractFloat32Exp( a );
1923
    aSign = extractFloat32Sign( a );
1924
    bSig = extractFloat32Frac( b );
1925
    bExp = extractFloat32Exp( b );
1926
    bSign = extractFloat32Sign( b );
1927
    zSign = aSign ^ bSign;
1928
    if ( aExp == 0xFF ) {
1929
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1930
            return propagateFloat32NaN( a, b STATUS_VAR );
1931
        }
1932
        if ( ( bExp | bSig ) == 0 ) {
1933
            float_raise( float_flag_invalid STATUS_VAR);
1934
            return float32_default_nan;
1935
        }
1936
        return packFloat32( zSign, 0xFF, 0 );
1937
    }
1938
    if ( bExp == 0xFF ) {
1939
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1940
        if ( ( aExp | aSig ) == 0 ) {
1941
            float_raise( float_flag_invalid STATUS_VAR);
1942
            return float32_default_nan;
1943
        }
1944
        return packFloat32( zSign, 0xFF, 0 );
1945
    }
1946
    if ( aExp == 0 ) {
1947
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1948
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1949
    }
1950
    if ( bExp == 0 ) {
1951
        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1952
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1953
    }
1954
    zExp = aExp + bExp - 0x7F;
1955
    aSig = ( aSig | 0x00800000 )<<7;
1956
    bSig = ( bSig | 0x00800000 )<<8;
1957
    shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
1958
    zSig = zSig64;
1959
    if ( 0 <= (int32_t) ( zSig<<1 ) ) {
1960
        zSig <<= 1;
1961
        --zExp;
1962
    }
1963
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1964

    
1965
}
1966

    
1967
/*----------------------------------------------------------------------------
1968
| Returns the result of dividing the single-precision floating-point value `a'
1969
| by the corresponding value `b'.  The operation is performed according to the
1970
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1971
*----------------------------------------------------------------------------*/
1972

    
1973
float32 float32_div( float32 a, float32 b STATUS_PARAM )
1974
{
1975
    flag aSign, bSign, zSign;
1976
    int16 aExp, bExp, zExp;
1977
    uint32_t aSig, bSig, zSig;
1978
    a = float32_squash_input_denormal(a STATUS_VAR);
1979
    b = float32_squash_input_denormal(b STATUS_VAR);
1980

    
1981
    aSig = extractFloat32Frac( a );
1982
    aExp = extractFloat32Exp( a );
1983
    aSign = extractFloat32Sign( a );
1984
    bSig = extractFloat32Frac( b );
1985
    bExp = extractFloat32Exp( b );
1986
    bSign = extractFloat32Sign( b );
1987
    zSign = aSign ^ bSign;
1988
    if ( aExp == 0xFF ) {
1989
        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1990
        if ( bExp == 0xFF ) {
1991
            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1992
            float_raise( float_flag_invalid STATUS_VAR);
1993
            return float32_default_nan;
1994
        }
1995
        return packFloat32( zSign, 0xFF, 0 );
1996
    }
1997
    if ( bExp == 0xFF ) {
1998
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1999
        return packFloat32( zSign, 0, 0 );
2000
    }
2001
    if ( bExp == 0 ) {
2002
        if ( bSig == 0 ) {
2003
            if ( ( aExp | aSig ) == 0 ) {
2004
                float_raise( float_flag_invalid STATUS_VAR);
2005
                return float32_default_nan;
2006
            }
2007
            float_raise( float_flag_divbyzero STATUS_VAR);
2008
            return packFloat32( zSign, 0xFF, 0 );
2009
        }
2010
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2011
    }
2012
    if ( aExp == 0 ) {
2013
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2014
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2015
    }
2016
    zExp = aExp - bExp + 0x7D;
2017
    aSig = ( aSig | 0x00800000 )<<7;
2018
    bSig = ( bSig | 0x00800000 )<<8;
2019
    if ( bSig <= ( aSig + aSig ) ) {
2020
        aSig >>= 1;
2021
        ++zExp;
2022
    }
2023
    zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2024
    if ( ( zSig & 0x3F ) == 0 ) {
2025
        zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2026
    }
2027
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2028

    
2029
}
2030

    
2031
/*----------------------------------------------------------------------------
2032
| Returns the remainder of the single-precision floating-point value `a'
2033
| with respect to the corresponding value `b'.  The operation is performed
2034
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2035
*----------------------------------------------------------------------------*/
2036

    
2037
float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2038
{
2039
    flag aSign, zSign;
2040
    int16 aExp, bExp, expDiff;
2041
    uint32_t aSig, bSig;
2042
    uint32_t q;
2043
    uint64_t aSig64, bSig64, q64;
2044
    uint32_t alternateASig;
2045
    int32_t sigMean;
2046
    a = float32_squash_input_denormal(a STATUS_VAR);
2047
    b = float32_squash_input_denormal(b STATUS_VAR);
2048

    
2049
    aSig = extractFloat32Frac( a );
2050
    aExp = extractFloat32Exp( a );
2051
    aSign = extractFloat32Sign( a );
2052
    bSig = extractFloat32Frac( b );
2053
    bExp = extractFloat32Exp( b );
2054
    if ( aExp == 0xFF ) {
2055
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2056
            return propagateFloat32NaN( a, b STATUS_VAR );
2057
        }
2058
        float_raise( float_flag_invalid STATUS_VAR);
2059
        return float32_default_nan;
2060
    }
2061
    if ( bExp == 0xFF ) {
2062
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2063
        return a;
2064
    }
2065
    if ( bExp == 0 ) {
2066
        if ( bSig == 0 ) {
2067
            float_raise( float_flag_invalid STATUS_VAR);
2068
            return float32_default_nan;
2069
        }
2070
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2071
    }
2072
    if ( aExp == 0 ) {
2073
        if ( aSig == 0 ) return a;
2074
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2075
    }
2076
    expDiff = aExp - bExp;
2077
    aSig |= 0x00800000;
2078
    bSig |= 0x00800000;
2079
    if ( expDiff < 32 ) {
2080
        aSig <<= 8;
2081
        bSig <<= 8;
2082
        if ( expDiff < 0 ) {
2083
            if ( expDiff < -1 ) return a;
2084
            aSig >>= 1;
2085
        }
2086
        q = ( bSig <= aSig );
2087
        if ( q ) aSig -= bSig;
2088
        if ( 0 < expDiff ) {
2089
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2090
            q >>= 32 - expDiff;
2091
            bSig >>= 2;
2092
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2093
        }
2094
        else {
2095
            aSig >>= 2;
2096
            bSig >>= 2;
2097
        }
2098
    }
2099
    else {
2100
        if ( bSig <= aSig ) aSig -= bSig;
2101
        aSig64 = ( (uint64_t) aSig )<<40;
2102
        bSig64 = ( (uint64_t) bSig )<<40;
2103
        expDiff -= 64;
2104
        while ( 0 < expDiff ) {
2105
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2106
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2107
            aSig64 = - ( ( bSig * q64 )<<38 );
2108
            expDiff -= 62;
2109
        }
2110
        expDiff += 64;
2111
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2112
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2113
        q = q64>>( 64 - expDiff );
2114
        bSig <<= 6;
2115
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2116
    }
2117
    do {
2118
        alternateASig = aSig;
2119
        ++q;
2120
        aSig -= bSig;
2121
    } while ( 0 <= (int32_t) aSig );
2122
    sigMean = aSig + alternateASig;
2123
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2124
        aSig = alternateASig;
2125
    }
2126
    zSign = ( (int32_t) aSig < 0 );
2127
    if ( zSign ) aSig = - aSig;
2128
    return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2129

    
2130
}
2131

    
2132
/*----------------------------------------------------------------------------
2133
| Returns the square root of the single-precision floating-point value `a'.
2134
| The operation is performed according to the IEC/IEEE Standard for Binary
2135
| Floating-Point Arithmetic.
2136
*----------------------------------------------------------------------------*/
2137

    
2138
float32 float32_sqrt( float32 a STATUS_PARAM )
2139
{
2140
    flag aSign;
2141
    int16 aExp, zExp;
2142
    uint32_t aSig, zSig;
2143
    uint64_t rem, term;
2144
    a = float32_squash_input_denormal(a STATUS_VAR);
2145

    
2146
    aSig = extractFloat32Frac( a );
2147
    aExp = extractFloat32Exp( a );
2148
    aSign = extractFloat32Sign( a );
2149
    if ( aExp == 0xFF ) {
2150
        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2151
        if ( ! aSign ) return a;
2152
        float_raise( float_flag_invalid STATUS_VAR);
2153
        return float32_default_nan;
2154
    }
2155
    if ( aSign ) {
2156
        if ( ( aExp | aSig ) == 0 ) return a;
2157
        float_raise( float_flag_invalid STATUS_VAR);
2158
        return float32_default_nan;
2159
    }
2160
    if ( aExp == 0 ) {
2161
        if ( aSig == 0 ) return float32_zero;
2162
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2163
    }
2164
    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2165
    aSig = ( aSig | 0x00800000 )<<8;
2166
    zSig = estimateSqrt32( aExp, aSig ) + 2;
2167
    if ( ( zSig & 0x7F ) <= 5 ) {
2168
        if ( zSig < 2 ) {
2169
            zSig = 0x7FFFFFFF;
2170
            goto roundAndPack;
2171
        }
2172
        aSig >>= aExp & 1;
2173
        term = ( (uint64_t) zSig ) * zSig;
2174
        rem = ( ( (uint64_t) aSig )<<32 ) - term;
2175
        while ( (int64_t) rem < 0 ) {
2176
            --zSig;
2177
            rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2178
        }
2179
        zSig |= ( rem != 0 );
2180
    }
2181
    shift32RightJamming( zSig, 1, &zSig );
2182
 roundAndPack:
2183
    return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2184

    
2185
}
2186

    
2187
/*----------------------------------------------------------------------------
2188
| Returns the binary exponential of the single-precision floating-point value
2189
| `a'. The operation is performed according to the IEC/IEEE Standard for
2190
| Binary Floating-Point Arithmetic.
2191
|
2192
| Uses the following identities:
2193
|
2194
| 1. -------------------------------------------------------------------------
2195
|      x    x*ln(2)
2196
|     2  = e
2197
|
2198
| 2. -------------------------------------------------------------------------
2199
|                      2     3     4     5           n
2200
|      x        x     x     x     x     x           x
2201
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2202
|               1!    2!    3!    4!    5!          n!
2203
*----------------------------------------------------------------------------*/
2204

    
2205
static const float64 float32_exp2_coefficients[15] =
2206
{
2207
    const_float64( 0x3ff0000000000000ll ), /*  1 */
2208
    const_float64( 0x3fe0000000000000ll ), /*  2 */
2209
    const_float64( 0x3fc5555555555555ll ), /*  3 */
2210
    const_float64( 0x3fa5555555555555ll ), /*  4 */
2211
    const_float64( 0x3f81111111111111ll ), /*  5 */
2212
    const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2213
    const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2214
    const_float64( 0x3efa01a01a01a01all ), /*  8 */
2215
    const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2216
    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2217
    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2218
    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2219
    const_float64( 0x3de6124613a86d09ll ), /* 13 */
2220
    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2221
    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2222
};
2223

    
2224
float32 float32_exp2( float32 a STATUS_PARAM )
2225
{
2226
    flag aSign;
2227
    int16 aExp;
2228
    uint32_t aSig;
2229
    float64 r, x, xn;
2230
    int i;
2231
    a = float32_squash_input_denormal(a STATUS_VAR);
2232

    
2233
    aSig = extractFloat32Frac( a );
2234
    aExp = extractFloat32Exp( a );
2235
    aSign = extractFloat32Sign( a );
2236

    
2237
    if ( aExp == 0xFF) {
2238
        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2239
        return (aSign) ? float32_zero : a;
2240
    }
2241
    if (aExp == 0) {
2242
        if (aSig == 0) return float32_one;
2243
    }
2244

    
2245
    float_raise( float_flag_inexact STATUS_VAR);
2246

    
2247
    /* ******************************* */
2248
    /* using float64 for approximation */
2249
    /* ******************************* */
2250
    x = float32_to_float64(a STATUS_VAR);
2251
    x = float64_mul(x, float64_ln2 STATUS_VAR);
2252

    
2253
    xn = x;
2254
    r = float64_one;
2255
    for (i = 0 ; i < 15 ; i++) {
2256
        float64 f;
2257

    
2258
        f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2259
        r = float64_add(r, f STATUS_VAR);
2260

    
2261
        xn = float64_mul(xn, x STATUS_VAR);
2262
    }
2263

    
2264
    return float64_to_float32(r, status);
2265
}
2266

    
2267
/*----------------------------------------------------------------------------
2268
| Returns the binary log of the single-precision floating-point value `a'.
2269
| The operation is performed according to the IEC/IEEE Standard for Binary
2270
| Floating-Point Arithmetic.
2271
*----------------------------------------------------------------------------*/
2272
float32 float32_log2( float32 a STATUS_PARAM )
2273
{
2274
    flag aSign, zSign;
2275
    int16 aExp;
2276
    uint32_t aSig, zSig, i;
2277

    
2278
    a = float32_squash_input_denormal(a STATUS_VAR);
2279
    aSig = extractFloat32Frac( a );
2280
    aExp = extractFloat32Exp( a );
2281
    aSign = extractFloat32Sign( a );
2282

    
2283
    if ( aExp == 0 ) {
2284
        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2285
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2286
    }
2287
    if ( aSign ) {
2288
        float_raise( float_flag_invalid STATUS_VAR);
2289
        return float32_default_nan;
2290
    }
2291
    if ( aExp == 0xFF ) {
2292
        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2293
        return a;
2294
    }
2295

    
2296
    aExp -= 0x7F;
2297
    aSig |= 0x00800000;
2298
    zSign = aExp < 0;
2299
    zSig = aExp << 23;
2300

    
2301
    for (i = 1 << 22; i > 0; i >>= 1) {
2302
        aSig = ( (uint64_t)aSig * aSig ) >> 23;
2303
        if ( aSig & 0x01000000 ) {
2304
            aSig >>= 1;
2305
            zSig |= i;
2306
        }
2307
    }
2308

    
2309
    if ( zSign )
2310
        zSig = -zSig;
2311

    
2312
    return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2313
}
2314

    
2315
/*----------------------------------------------------------------------------
2316
| Returns 1 if the single-precision floating-point value `a' is equal to
2317
| the corresponding value `b', and 0 otherwise.  The comparison is performed
2318
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2319
*----------------------------------------------------------------------------*/
2320

    
2321
int float32_eq( float32 a, float32 b STATUS_PARAM )
2322
{
2323
    a = float32_squash_input_denormal(a STATUS_VAR);
2324
    b = float32_squash_input_denormal(b STATUS_VAR);
2325

    
2326
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2327
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2328
       ) {
2329
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2330
            float_raise( float_flag_invalid STATUS_VAR);
2331
        }
2332
        return 0;
2333
    }
2334
    return ( float32_val(a) == float32_val(b) ) ||
2335
            ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2336

    
2337
}
2338

    
2339
/*----------------------------------------------------------------------------
2340
| Returns 1 if the single-precision floating-point value `a' is less than
2341
| or equal to the corresponding value `b', and 0 otherwise.  The comparison
2342
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2343
| Arithmetic.
2344
*----------------------------------------------------------------------------*/
2345

    
2346
int float32_le( float32 a, float32 b STATUS_PARAM )
2347
{
2348
    flag aSign, bSign;
2349
    uint32_t av, bv;
2350
    a = float32_squash_input_denormal(a STATUS_VAR);
2351
    b = float32_squash_input_denormal(b STATUS_VAR);
2352

    
2353
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2354
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2355
       ) {
2356
        float_raise( float_flag_invalid STATUS_VAR);
2357
        return 0;
2358
    }
2359
    aSign = extractFloat32Sign( a );
2360
    bSign = extractFloat32Sign( b );
2361
    av = float32_val(a);
2362
    bv = float32_val(b);
2363
    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2364
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2365

    
2366
}
2367

    
2368
/*----------------------------------------------------------------------------
2369
| Returns 1 if the single-precision floating-point value `a' is less than
2370
| the corresponding value `b', and 0 otherwise.  The comparison is performed
2371
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2372
*----------------------------------------------------------------------------*/
2373

    
2374
int float32_lt( float32 a, float32 b STATUS_PARAM )
2375
{
2376
    flag aSign, bSign;
2377
    uint32_t av, bv;
2378
    a = float32_squash_input_denormal(a STATUS_VAR);
2379
    b = float32_squash_input_denormal(b STATUS_VAR);
2380

    
2381
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2382
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2383
       ) {
2384
        float_raise( float_flag_invalid STATUS_VAR);
2385
        return 0;
2386
    }
2387
    aSign = extractFloat32Sign( a );
2388
    bSign = extractFloat32Sign( b );
2389
    av = float32_val(a);
2390
    bv = float32_val(b);
2391
    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2392
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2393

    
2394
}
2395

    
2396
/*----------------------------------------------------------------------------
2397
| Returns 1 if the single-precision floating-point value `a' is equal to
2398
| the corresponding value `b', and 0 otherwise.  The invalid exception is
2399
| raised if either operand is a NaN.  Otherwise, the comparison is performed
2400
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2401
*----------------------------------------------------------------------------*/
2402

    
2403
int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2404
{
2405
    uint32_t av, bv;
2406
    a = float32_squash_input_denormal(a STATUS_VAR);
2407
    b = float32_squash_input_denormal(b STATUS_VAR);
2408

    
2409
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2410
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2411
       ) {
2412
        float_raise( float_flag_invalid STATUS_VAR);
2413
        return 0;
2414
    }
2415
    av = float32_val(a);
2416
    bv = float32_val(b);
2417
    return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2418

    
2419
}
2420

    
2421
/*----------------------------------------------------------------------------
2422
| Returns 1 if the single-precision floating-point value `a' is less than or
2423
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2424
| cause an exception.  Otherwise, the comparison is performed according to the
2425
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2426
*----------------------------------------------------------------------------*/
2427

    
2428
int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2429
{
2430
    flag aSign, bSign;
2431
    uint32_t av, bv;
2432
    a = float32_squash_input_denormal(a STATUS_VAR);
2433
    b = float32_squash_input_denormal(b STATUS_VAR);
2434

    
2435
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2436
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2437
       ) {
2438
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2439
            float_raise( float_flag_invalid STATUS_VAR);
2440
        }
2441
        return 0;
2442
    }
2443
    aSign = extractFloat32Sign( a );
2444
    bSign = extractFloat32Sign( b );
2445
    av = float32_val(a);
2446
    bv = float32_val(b);
2447
    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2448
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2449

    
2450
}
2451

    
2452
/*----------------------------------------------------------------------------
2453
| Returns 1 if the single-precision floating-point value `a' is less than
2454
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2455
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2456
| Standard for Binary Floating-Point Arithmetic.
2457
*----------------------------------------------------------------------------*/
2458

    
2459
int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2460
{
2461
    flag aSign, bSign;
2462
    uint32_t av, bv;
2463
    a = float32_squash_input_denormal(a STATUS_VAR);
2464
    b = float32_squash_input_denormal(b STATUS_VAR);
2465

    
2466
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2467
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2468
       ) {
2469
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2470
            float_raise( float_flag_invalid STATUS_VAR);
2471
        }
2472
        return 0;
2473
    }
2474
    aSign = extractFloat32Sign( a );
2475
    bSign = extractFloat32Sign( b );
2476
    av = float32_val(a);
2477
    bv = float32_val(b);
2478
    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2479
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2480

    
2481
}
2482

    
2483
/*----------------------------------------------------------------------------
2484
| Returns the result of converting the double-precision floating-point value
2485
| `a' to the 32-bit two's complement integer format.  The conversion is
2486
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2487
| Arithmetic---which means in particular that the conversion is rounded
2488
| according to the current rounding mode.  If `a' is a NaN, the largest
2489
| positive integer is returned.  Otherwise, if the conversion overflows, the
2490
| largest integer with the same sign as `a' is returned.
2491
*----------------------------------------------------------------------------*/
2492

    
2493
int32 float64_to_int32( float64 a STATUS_PARAM )
2494
{
2495
    flag aSign;
2496
    int16 aExp, shiftCount;
2497
    uint64_t aSig;
2498
    a = float64_squash_input_denormal(a STATUS_VAR);
2499

    
2500
    aSig = extractFloat64Frac( a );
2501
    aExp = extractFloat64Exp( a );
2502
    aSign = extractFloat64Sign( a );
2503
    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2504
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2505
    shiftCount = 0x42C - aExp;
2506
    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2507
    return roundAndPackInt32( aSign, aSig STATUS_VAR );
2508

    
2509
}
2510

    
2511
/*----------------------------------------------------------------------------
2512
| Returns the result of converting the double-precision floating-point value
2513
| `a' to the 32-bit two's complement integer format.  The conversion is
2514
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2515
| Arithmetic, except that the conversion is always rounded toward zero.
2516
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2517
| the conversion overflows, the largest integer with the same sign as `a' is
2518
| returned.
2519
*----------------------------------------------------------------------------*/
2520

    
2521
int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2522
{
2523
    flag aSign;
2524
    int16 aExp, shiftCount;
2525
    uint64_t aSig, savedASig;
2526
    int32 z;
2527
    a = float64_squash_input_denormal(a STATUS_VAR);
2528

    
2529
    aSig = extractFloat64Frac( a );
2530
    aExp = extractFloat64Exp( a );
2531
    aSign = extractFloat64Sign( a );
2532
    if ( 0x41E < aExp ) {
2533
        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2534
        goto invalid;
2535
    }
2536
    else if ( aExp < 0x3FF ) {
2537
        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2538
        return 0;
2539
    }
2540
    aSig |= LIT64( 0x0010000000000000 );
2541
    shiftCount = 0x433 - aExp;
2542
    savedASig = aSig;
2543
    aSig >>= shiftCount;
2544
    z = aSig;
2545
    if ( aSign ) z = - z;
2546
    if ( ( z < 0 ) ^ aSign ) {
2547
 invalid:
2548
        float_raise( float_flag_invalid STATUS_VAR);
2549
        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2550
    }
2551
    if ( ( aSig<<shiftCount ) != savedASig ) {
2552
        STATUS(float_exception_flags) |= float_flag_inexact;
2553
    }
2554
    return z;
2555

    
2556
}
2557

    
2558
/*----------------------------------------------------------------------------
2559
| Returns the result of converting the double-precision floating-point value
2560
| `a' to the 16-bit two's complement integer format.  The conversion is
2561
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2562
| Arithmetic, except that the conversion is always rounded toward zero.
2563
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2564
| the conversion overflows, the largest integer with the same sign as `a' is
2565
| returned.
2566
*----------------------------------------------------------------------------*/
2567

    
2568
int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2569
{
2570
    flag aSign;
2571
    int16 aExp, shiftCount;
2572
    uint64_t aSig, savedASig;
2573
    int32 z;
2574

    
2575
    aSig = extractFloat64Frac( a );
2576
    aExp = extractFloat64Exp( a );
2577
    aSign = extractFloat64Sign( a );
2578
    if ( 0x40E < aExp ) {
2579
        if ( ( aExp == 0x7FF ) && aSig ) {
2580
            aSign = 0;
2581
        }
2582
        goto invalid;
2583
    }
2584
    else if ( aExp < 0x3FF ) {
2585
        if ( aExp || aSig ) {
2586
            STATUS(float_exception_flags) |= float_flag_inexact;
2587
        }
2588
        return 0;
2589
    }
2590
    aSig |= LIT64( 0x0010000000000000 );
2591
    shiftCount = 0x433 - aExp;
2592
    savedASig = aSig;
2593
    aSig >>= shiftCount;
2594
    z = aSig;
2595
    if ( aSign ) {
2596
        z = - z;
2597
    }
2598
    if ( ( (int16_t)z < 0 ) ^ aSign ) {
2599
 invalid:
2600
        float_raise( float_flag_invalid STATUS_VAR);
2601
        return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2602
    }
2603
    if ( ( aSig<<shiftCount ) != savedASig ) {
2604
        STATUS(float_exception_flags) |= float_flag_inexact;
2605
    }
2606
    return z;
2607
}
2608

    
2609
/*----------------------------------------------------------------------------
2610
| Returns the result of converting the double-precision floating-point value
2611
| `a' to the 64-bit two's complement integer format.  The conversion is
2612
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2613
| Arithmetic---which means in particular that the conversion is rounded
2614
| according to the current rounding mode.  If `a' is a NaN, the largest
2615
| positive integer is returned.  Otherwise, if the conversion overflows, the
2616
| largest integer with the same sign as `a' is returned.
2617
*----------------------------------------------------------------------------*/
2618

    
2619
int64 float64_to_int64( float64 a STATUS_PARAM )
2620
{
2621
    flag aSign;
2622
    int16 aExp, shiftCount;
2623
    uint64_t aSig, aSigExtra;
2624
    a = float64_squash_input_denormal(a STATUS_VAR);
2625

    
2626
    aSig = extractFloat64Frac( a );
2627
    aExp = extractFloat64Exp( a );
2628
    aSign = extractFloat64Sign( a );
2629
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2630
    shiftCount = 0x433 - aExp;
2631
    if ( shiftCount <= 0 ) {
2632
        if ( 0x43E < aExp ) {
2633
            float_raise( float_flag_invalid STATUS_VAR);
2634
            if (    ! aSign
2635
                 || (    ( aExp == 0x7FF )
2636
                      && ( aSig != LIT64( 0x0010000000000000 ) ) )
2637
               ) {
2638
                return LIT64( 0x7FFFFFFFFFFFFFFF );
2639
            }
2640
            return (int64_t) LIT64( 0x8000000000000000 );
2641
        }
2642
        aSigExtra = 0;
2643
        aSig <<= - shiftCount;
2644
    }
2645
    else {
2646
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2647
    }
2648
    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2649

    
2650
}
2651

    
2652
/*----------------------------------------------------------------------------
2653
| Returns the result of converting the double-precision floating-point value
2654
| `a' to the 64-bit two's complement integer format.  The conversion is
2655
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2656
| Arithmetic, except that the conversion is always rounded toward zero.
2657
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2658
| the conversion overflows, the largest integer with the same sign as `a' is
2659
| returned.
2660
*----------------------------------------------------------------------------*/
2661

    
2662
int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2663
{
2664
    flag aSign;
2665
    int16 aExp, shiftCount;
2666
    uint64_t aSig;
2667
    int64 z;
2668
    a = float64_squash_input_denormal(a STATUS_VAR);
2669

    
2670
    aSig = extractFloat64Frac( a );
2671
    aExp = extractFloat64Exp( a );
2672
    aSign = extractFloat64Sign( a );
2673
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2674
    shiftCount = aExp - 0x433;
2675
    if ( 0 <= shiftCount ) {
2676
        if ( 0x43E <= aExp ) {
2677
            if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2678
                float_raise( float_flag_invalid STATUS_VAR);
2679
                if (    ! aSign
2680
                     || (    ( aExp == 0x7FF )
2681
                          && ( aSig != LIT64( 0x0010000000000000 ) ) )
2682
                   ) {
2683
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
2684
                }
2685
            }
2686
            return (int64_t) LIT64( 0x8000000000000000 );
2687
        }
2688
        z = aSig<<shiftCount;
2689
    }
2690
    else {
2691
        if ( aExp < 0x3FE ) {
2692
            if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2693
            return 0;
2694
        }
2695
        z = aSig>>( - shiftCount );
2696
        if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
2697
            STATUS(float_exception_flags) |= float_flag_inexact;
2698
        }
2699
    }
2700
    if ( aSign ) z = - z;
2701
    return z;
2702

    
2703
}
2704

    
2705
/*----------------------------------------------------------------------------
2706
| Returns the result of converting the double-precision floating-point value
2707
| `a' to the single-precision floating-point format.  The conversion is
2708
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2709
| Arithmetic.
2710
*----------------------------------------------------------------------------*/
2711

    
2712
float32 float64_to_float32( float64 a STATUS_PARAM )
2713
{
2714
    flag aSign;
2715
    int16 aExp;
2716
    uint64_t aSig;
2717
    uint32_t zSig;
2718
    a = float64_squash_input_denormal(a STATUS_VAR);
2719

    
2720
    aSig = extractFloat64Frac( a );
2721
    aExp = extractFloat64Exp( a );
2722
    aSign = extractFloat64Sign( a );
2723
    if ( aExp == 0x7FF ) {
2724
        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2725
        return packFloat32( aSign, 0xFF, 0 );
2726
    }
2727
    shift64RightJamming( aSig, 22, &aSig );
2728
    zSig = aSig;
2729
    if ( aExp || zSig ) {
2730
        zSig |= 0x40000000;
2731
        aExp -= 0x381;
2732
    }
2733
    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2734

    
2735
}
2736

    
2737

    
2738
/*----------------------------------------------------------------------------
2739
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2740
| half-precision floating-point value, returning the result.  After being
2741
| shifted into the proper positions, the three fields are simply added
2742
| together to form the result.  This means that any integer portion of `zSig'
2743
| will be added into the exponent.  Since a properly normalized significand
2744
| will have an integer portion equal to 1, the `zExp' input should be 1 less
2745
| than the desired result exponent whenever `zSig' is a complete, normalized
2746
| significand.
2747
*----------------------------------------------------------------------------*/
2748
static float16 packFloat16(flag zSign, int16 zExp, uint16_t zSig)
2749
{
2750
    return make_float16(
2751
        (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
2752
}
2753

    
2754
/* Half precision floats come in two formats: standard IEEE and "ARM" format.
2755
   The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
2756

    
2757
float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
2758
{
2759
    flag aSign;
2760
    int16 aExp;
2761
    uint32_t aSig;
2762

    
2763
    aSign = extractFloat16Sign(a);
2764
    aExp = extractFloat16Exp(a);
2765
    aSig = extractFloat16Frac(a);
2766

    
2767
    if (aExp == 0x1f && ieee) {
2768
        if (aSig) {
2769
            return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
2770
        }
2771
        return packFloat32(aSign, 0xff, aSig << 13);
2772
    }
2773
    if (aExp == 0) {
2774
        int8 shiftCount;
2775

    
2776
        if (aSig == 0) {
2777
            return packFloat32(aSign, 0, 0);
2778
        }
2779

    
2780
        shiftCount = countLeadingZeros32( aSig ) - 21;
2781
        aSig = aSig << shiftCount;
2782
        aExp = -shiftCount;
2783
    }
2784
    return packFloat32( aSign, aExp + 0x70, aSig << 13);
2785
}
2786

    
2787
float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
2788
{
2789
    flag aSign;
2790
    int16 aExp;
2791
    uint32_t aSig;
2792
    uint32_t mask;
2793
    uint32_t increment;
2794
    int8 roundingMode;
2795
    a = float32_squash_input_denormal(a STATUS_VAR);
2796

    
2797
    aSig = extractFloat32Frac( a );
2798
    aExp = extractFloat32Exp( a );
2799
    aSign = extractFloat32Sign( a );
2800
    if ( aExp == 0xFF ) {
2801
        if (aSig) {
2802
            /* Input is a NaN */
2803
            float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2804
            if (!ieee) {
2805
                return packFloat16(aSign, 0, 0);
2806
            }
2807
            return r;
2808
        }
2809
        /* Infinity */
2810
        if (!ieee) {
2811
            float_raise(float_flag_invalid STATUS_VAR);
2812
            return packFloat16(aSign, 0x1f, 0x3ff);
2813
        }
2814
        return packFloat16(aSign, 0x1f, 0);
2815
    }
2816
    if (aExp == 0 && aSig == 0) {
2817
        return packFloat16(aSign, 0, 0);
2818
    }
2819
    /* Decimal point between bits 22 and 23.  */
2820
    aSig |= 0x00800000;
2821
    aExp -= 0x7f;
2822
    if (aExp < -14) {
2823
        mask = 0x00ffffff;
2824
        if (aExp >= -24) {
2825
            mask >>= 25 + aExp;
2826
        }
2827
    } else {
2828
        mask = 0x00001fff;
2829
    }
2830
    if (aSig & mask) {
2831
        float_raise( float_flag_underflow STATUS_VAR );
2832
        roundingMode = STATUS(float_rounding_mode);
2833
        switch (roundingMode) {
2834
        case float_round_nearest_even:
2835
            increment = (mask + 1) >> 1;
2836
            if ((aSig & mask) == increment) {
2837
                increment = aSig & (increment << 1);
2838
            }
2839
            break;
2840
        case float_round_up:
2841
            increment = aSign ? 0 : mask;
2842
            break;
2843
        case float_round_down:
2844
            increment = aSign ? mask : 0;
2845
            break;
2846
        default: /* round_to_zero */
2847
            increment = 0;
2848
            break;
2849
        }
2850
        aSig += increment;
2851
        if (aSig >= 0x01000000) {
2852
            aSig >>= 1;
2853
            aExp++;
2854
        }
2855
    } else if (aExp < -14
2856
          && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2857
        float_raise( float_flag_underflow STATUS_VAR);
2858
    }
2859

    
2860
    if (ieee) {
2861
        if (aExp > 15) {
2862
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2863
            return packFloat16(aSign, 0x1f, 0);
2864
        }
2865
    } else {
2866
        if (aExp > 16) {
2867
            float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
2868
            return packFloat16(aSign, 0x1f, 0x3ff);
2869
        }
2870
    }
2871
    if (aExp < -24) {
2872
        return packFloat16(aSign, 0, 0);
2873
    }
2874
    if (aExp < -14) {
2875
        aSig >>= -14 - aExp;
2876
        aExp = -14;
2877
    }
2878
    return packFloat16(aSign, aExp + 14, aSig >> 13);
2879
}
2880

    
2881
#ifdef FLOATX80
2882

    
2883
/*----------------------------------------------------------------------------
2884
| Returns the result of converting the double-precision floating-point value
2885
| `a' to the extended double-precision floating-point format.  The conversion
2886
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2887
| Arithmetic.
2888
*----------------------------------------------------------------------------*/
2889

    
2890
floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2891
{
2892
    flag aSign;
2893
    int16 aExp;
2894
    uint64_t aSig;
2895

    
2896
    a = float64_squash_input_denormal(a STATUS_VAR);
2897
    aSig = extractFloat64Frac( a );
2898
    aExp = extractFloat64Exp( a );
2899
    aSign = extractFloat64Sign( a );
2900
    if ( aExp == 0x7FF ) {
2901
        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2902
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2903
    }
2904
    if ( aExp == 0 ) {
2905
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2906
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2907
    }
2908
    return
2909
        packFloatx80(
2910
            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2911

    
2912
}
2913

    
2914
#endif
2915

    
2916
#ifdef FLOAT128
2917

    
2918
/*----------------------------------------------------------------------------
2919
| Returns the result of converting the double-precision floating-point value
2920
| `a' to the quadruple-precision floating-point format.  The conversion is
2921
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2922
| Arithmetic.
2923
*----------------------------------------------------------------------------*/
2924

    
2925
float128 float64_to_float128( float64 a STATUS_PARAM )
2926
{
2927
    flag aSign;
2928
    int16 aExp;
2929
    uint64_t aSig, zSig0, zSig1;
2930

    
2931
    a = float64_squash_input_denormal(a STATUS_VAR);
2932
    aSig = extractFloat64Frac( a );
2933
    aExp = extractFloat64Exp( a );
2934
    aSign = extractFloat64Sign( a );
2935
    if ( aExp == 0x7FF ) {
2936
        if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2937
        return packFloat128( aSign, 0x7FFF, 0, 0 );
2938
    }
2939
    if ( aExp == 0 ) {
2940
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2941
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2942
        --aExp;
2943
    }
2944
    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2945
    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2946

    
2947
}
2948

    
2949
#endif
2950

    
2951
/*----------------------------------------------------------------------------
2952
| Rounds the double-precision floating-point value `a' to an integer, and
2953
| returns the result as a double-precision floating-point value.  The
2954
| operation is performed according to the IEC/IEEE Standard for Binary
2955
| Floating-Point Arithmetic.
2956
*----------------------------------------------------------------------------*/
2957

    
2958
float64 float64_round_to_int( float64 a STATUS_PARAM )
2959
{
2960
    flag aSign;
2961
    int16 aExp;
2962
    uint64_t lastBitMask, roundBitsMask;
2963
    int8 roundingMode;
2964
    uint64_t z;
2965
    a = float64_squash_input_denormal(a STATUS_VAR);
2966

    
2967
    aExp = extractFloat64Exp( a );
2968
    if ( 0x433 <= aExp ) {
2969
        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2970
            return propagateFloat64NaN( a, a STATUS_VAR );
2971
        }
2972
        return a;
2973
    }
2974
    if ( aExp < 0x3FF ) {
2975
        if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
2976
        STATUS(float_exception_flags) |= float_flag_inexact;
2977
        aSign = extractFloat64Sign( a );
2978
        switch ( STATUS(float_rounding_mode) ) {
2979
         case float_round_nearest_even:
2980
            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2981
                return packFloat64( aSign, 0x3FF, 0 );
2982
            }
2983
            break;
2984
         case float_round_down:
2985
            return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2986
         case float_round_up:
2987
            return make_float64(
2988
            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2989
        }
2990
        return packFloat64( aSign, 0, 0 );
2991
    }
2992
    lastBitMask = 1;
2993
    lastBitMask <<= 0x433 - aExp;
2994
    roundBitsMask = lastBitMask - 1;
2995
    z = float64_val(a);
2996
    roundingMode = STATUS(float_rounding_mode);
2997
    if ( roundingMode == float_round_nearest_even ) {
2998
        z += lastBitMask>>1;
2999
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3000
    }
3001
    else if ( roundingMode != float_round_to_zero ) {
3002
        if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3003
            z += roundBitsMask;
3004
        }
3005
    }
3006
    z &= ~ roundBitsMask;
3007
    if ( z != float64_val(a) )
3008
        STATUS(float_exception_flags) |= float_flag_inexact;
3009
    return make_float64(z);
3010

    
3011
}
3012

    
3013
float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3014
{
3015
    int oldmode;
3016
    float64 res;
3017
    oldmode = STATUS(float_rounding_mode);
3018
    STATUS(float_rounding_mode) = float_round_to_zero;
3019
    res = float64_round_to_int(a STATUS_VAR);
3020
    STATUS(float_rounding_mode) = oldmode;
3021
    return res;
3022
}
3023

    
3024
/*----------------------------------------------------------------------------
3025
| Returns the result of adding the absolute values of the double-precision
3026
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3027
| before being returned.  `zSign' is ignored if the result is a NaN.
3028
| The addition is performed according to the IEC/IEEE Standard for Binary
3029
| Floating-Point Arithmetic.
3030
*----------------------------------------------------------------------------*/
3031

    
3032
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3033
{
3034
    int16 aExp, bExp, zExp;
3035
    uint64_t aSig, bSig, zSig;
3036
    int16 expDiff;
3037

    
3038
    aSig = extractFloat64Frac( a );
3039
    aExp = extractFloat64Exp( a );
3040
    bSig = extractFloat64Frac( b );
3041
    bExp = extractFloat64Exp( b );
3042
    expDiff = aExp - bExp;
3043
    aSig <<= 9;
3044
    bSig <<= 9;
3045
    if ( 0 < expDiff ) {
3046
        if ( aExp == 0x7FF ) {
3047
            if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3048
            return a;
3049
        }
3050
        if ( bExp == 0 ) {
3051
            --expDiff;
3052
        }
3053
        else {
3054
            bSig |= LIT64( 0x2000000000000000 );
3055
        }
3056
        shift64RightJamming( bSig, expDiff, &bSig );
3057
        zExp = aExp;
3058
    }
3059
    else if ( expDiff < 0 ) {
3060
        if ( bExp == 0x7FF ) {
3061
            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3062
            return packFloat64( zSign, 0x7FF, 0 );
3063
        }
3064
        if ( aExp == 0 ) {
3065
            ++expDiff;
3066
        }
3067
        else {
3068
            aSig |= LIT64( 0x2000000000000000 );
3069
        }
3070
        shift64RightJamming( aSig, - expDiff, &aSig );
3071
        zExp = bExp;
3072
    }
3073
    else {
3074
        if ( aExp == 0x7FF ) {
3075
            if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3076
            return a;
3077
        }
3078
        if ( aExp == 0 ) {
3079
            if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
3080
            return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3081
        }
3082
        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3083
        zExp = aExp;
3084
        goto roundAndPack;
3085
    }
3086
    aSig |= LIT64( 0x2000000000000000 );
3087
    zSig = ( aSig + bSig )<<1;
3088
    --zExp;
3089
    if ( (int64_t) zSig < 0 ) {
3090
        zSig = aSig + bSig;
3091
        ++zExp;
3092
    }
3093
 roundAndPack:
3094
    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3095

    
3096
}
3097

    
3098
/*----------------------------------------------------------------------------
3099
| Returns the result of subtracting the absolute values of the double-
3100
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
3101
| difference is negated before being returned.  `zSign' is ignored if the
3102
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
3103
| Standard for Binary Floating-Point Arithmetic.
3104
*----------------------------------------------------------------------------*/
3105

    
3106
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3107
{
3108
    int16 aExp, bExp, zExp;
3109
    uint64_t aSig, bSig, zSig;
3110
    int16 expDiff;
3111

    
3112
    aSig = extractFloat64Frac( a );
3113
    aExp = extractFloat64Exp( a );
3114
    bSig = extractFloat64Frac( b );
3115
    bExp = extractFloat64Exp( b );
3116
    expDiff = aExp - bExp;
3117
    aSig <<= 10;
3118
    bSig <<= 10;
3119
    if ( 0 < expDiff ) goto aExpBigger;
3120
    if ( expDiff < 0 ) goto bExpBigger;
3121
    if ( aExp == 0x7FF ) {
3122
        if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3123
        float_raise( float_flag_invalid STATUS_VAR);
3124
        return float64_default_nan;
3125
    }
3126
    if ( aExp == 0 ) {
3127
        aExp = 1;
3128
        bExp = 1;
3129
    }
3130
    if ( bSig < aSig ) goto aBigger;
3131
    if ( aSig < bSig ) goto bBigger;
3132
    return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3133
 bExpBigger:
3134
    if ( bExp == 0x7FF ) {
3135
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3136
        return packFloat64( zSign ^ 1, 0x7FF, 0 );
3137
    }
3138
    if ( aExp == 0 ) {
3139
        ++expDiff;
3140
    }
3141
    else {
3142
        aSig |= LIT64( 0x4000000000000000 );
3143
    }
3144
    shift64RightJamming( aSig, - expDiff, &aSig );
3145
    bSig |= LIT64( 0x4000000000000000 );
3146
 bBigger:
3147
    zSig = bSig - aSig;
3148
    zExp = bExp;
3149
    zSign ^= 1;
3150
    goto normalizeRoundAndPack;
3151
 aExpBigger:
3152
    if ( aExp == 0x7FF ) {
3153
        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3154
        return a;
3155
    }
3156
    if ( bExp == 0 ) {
3157
        --expDiff;
3158
    }
3159
    else {
3160
        bSig |= LIT64( 0x4000000000000000 );
3161
    }
3162
    shift64RightJamming( bSig, expDiff, &bSig );
3163
    aSig |= LIT64( 0x4000000000000000 );
3164
 aBigger:
3165
    zSig = aSig - bSig;
3166
    zExp = aExp;
3167
 normalizeRoundAndPack:
3168
    --zExp;
3169
    return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3170

    
3171
}
3172

    
3173
/*----------------------------------------------------------------------------
3174
| Returns the result of adding the double-precision floating-point values `a'
3175
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
3176
| Binary Floating-Point Arithmetic.
3177
*----------------------------------------------------------------------------*/
3178

    
3179
float64 float64_add( float64 a, float64 b STATUS_PARAM )
3180
{
3181
    flag aSign, bSign;
3182
    a = float64_squash_input_denormal(a STATUS_VAR);
3183
    b = float64_squash_input_denormal(b STATUS_VAR);
3184

    
3185
    aSign = extractFloat64Sign( a );
3186
    bSign = extractFloat64Sign( b );
3187
    if ( aSign == bSign ) {
3188
        return addFloat64Sigs( a, b, aSign STATUS_VAR );
3189
    }
3190
    else {
3191
        return subFloat64Sigs( a, b, aSign STATUS_VAR );
3192
    }
3193

    
3194
}
3195

    
3196
/*----------------------------------------------------------------------------
3197
| Returns the result of subtracting the double-precision floating-point values
3198
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3199
| for Binary Floating-Point Arithmetic.
3200
*----------------------------------------------------------------------------*/
3201

    
3202
float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3203
{
3204
    flag aSign, bSign;
3205
    a = float64_squash_input_denormal(a STATUS_VAR);
3206
    b = float64_squash_input_denormal(b STATUS_VAR);
3207

    
3208
    aSign = extractFloat64Sign( a );
3209
    bSign = extractFloat64Sign( b );
3210
    if ( aSign == bSign ) {
3211
        return subFloat64Sigs( a, b, aSign STATUS_VAR );
3212
    }
3213
    else {
3214
        return addFloat64Sigs( a, b, aSign STATUS_VAR );
3215
    }
3216

    
3217
}
3218

    
3219
/*----------------------------------------------------------------------------
3220
| Returns the result of multiplying the double-precision floating-point values
3221
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3222
| for Binary Floating-Point Arithmetic.
3223
*----------------------------------------------------------------------------*/
3224

    
3225
float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3226
{
3227
    flag aSign, bSign, zSign;
3228
    int16 aExp, bExp, zExp;
3229
    uint64_t aSig, bSig, zSig0, zSig1;
3230

    
3231
    a = float64_squash_input_denormal(a STATUS_VAR);
3232
    b = float64_squash_input_denormal(b STATUS_VAR);
3233

    
3234
    aSig = extractFloat64Frac( a );
3235
    aExp = extractFloat64Exp( a );
3236
    aSign = extractFloat64Sign( a );
3237
    bSig = extractFloat64Frac( b );
3238
    bExp = extractFloat64Exp( b );
3239
    bSign = extractFloat64Sign( b );
3240
    zSign = aSign ^ bSign;
3241
    if ( aExp == 0x7FF ) {
3242
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3243
            return propagateFloat64NaN( a, b STATUS_VAR );
3244
        }
3245
        if ( ( bExp | bSig ) == 0 ) {
3246
            float_raise( float_flag_invalid STATUS_VAR);
3247
            return float64_default_nan;
3248
        }
3249
        return packFloat64( zSign, 0x7FF, 0 );
3250
    }
3251
    if ( bExp == 0x7FF ) {
3252
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3253
        if ( ( aExp | aSig ) == 0 ) {
3254
            float_raise( float_flag_invalid STATUS_VAR);
3255
            return float64_default_nan;
3256
        }
3257
        return packFloat64( zSign, 0x7FF, 0 );
3258
    }
3259
    if ( aExp == 0 ) {
3260
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3261
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3262
    }
3263
    if ( bExp == 0 ) {
3264
        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3265
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3266
    }
3267
    zExp = aExp + bExp - 0x3FF;
3268
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3269
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3270
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
3271
    zSig0 |= ( zSig1 != 0 );
3272
    if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3273
        zSig0 <<= 1;
3274
        --zExp;
3275
    }
3276
    return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3277

    
3278
}
3279

    
3280
/*----------------------------------------------------------------------------
3281
| Returns the result of dividing the double-precision floating-point value `a'
3282
| by the corresponding value `b'.  The operation is performed according to
3283
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3284
*----------------------------------------------------------------------------*/
3285

    
3286
float64 float64_div( float64 a, float64 b STATUS_PARAM )
3287
{
3288
    flag aSign, bSign, zSign;
3289
    int16 aExp, bExp, zExp;
3290
    uint64_t aSig, bSig, zSig;
3291
    uint64_t rem0, rem1;
3292
    uint64_t term0, term1;
3293
    a = float64_squash_input_denormal(a STATUS_VAR);
3294
    b = float64_squash_input_denormal(b STATUS_VAR);
3295

    
3296
    aSig = extractFloat64Frac( a );
3297
    aExp = extractFloat64Exp( a );
3298
    aSign = extractFloat64Sign( a );
3299
    bSig = extractFloat64Frac( b );
3300
    bExp = extractFloat64Exp( b );
3301
    bSign = extractFloat64Sign( b );
3302
    zSign = aSign ^ bSign;
3303
    if ( aExp == 0x7FF ) {
3304
        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3305
        if ( bExp == 0x7FF ) {
3306
            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3307
            float_raise( float_flag_invalid STATUS_VAR);
3308
            return float64_default_nan;
3309
        }
3310
        return packFloat64( zSign, 0x7FF, 0 );
3311
    }
3312
    if ( bExp == 0x7FF ) {
3313
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3314
        return packFloat64( zSign, 0, 0 );
3315
    }
3316
    if ( bExp == 0 ) {
3317
        if ( bSig == 0 ) {
3318
            if ( ( aExp | aSig ) == 0 ) {
3319
                float_raise( float_flag_invalid STATUS_VAR);
3320
                return float64_default_nan;
3321
            }
3322
            float_raise( float_flag_divbyzero STATUS_VAR);
3323
            return packFloat64( zSign, 0x7FF, 0 );
3324
        }
3325
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3326
    }
3327
    if ( aExp == 0 ) {
3328
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3329
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3330
    }
3331
    zExp = aExp - bExp + 0x3FD;
3332
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3333
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3334
    if ( bSig <= ( aSig + aSig ) ) {
3335
        aSig >>= 1;
3336
        ++zExp;
3337
    }
3338
    zSig = estimateDiv128To64( aSig, 0, bSig );
3339
    if ( ( zSig & 0x1FF ) <= 2 ) {
3340
        mul64To128( bSig, zSig, &term0, &term1 );
3341
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3342
        while ( (int64_t) rem0 < 0 ) {
3343
            --zSig;
3344
            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3345
        }
3346
        zSig |= ( rem1 != 0 );
3347
    }
3348
    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3349

    
3350
}
3351

    
3352
/*----------------------------------------------------------------------------
3353
| Returns the remainder of the double-precision floating-point value `a'
3354
| with respect to the corresponding value `b'.  The operation is performed
3355
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3356
*----------------------------------------------------------------------------*/
3357

    
3358
float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3359
{
3360
    flag aSign, zSign;
3361
    int16 aExp, bExp, expDiff;
3362
    uint64_t aSig, bSig;
3363
    uint64_t q, alternateASig;
3364
    int64_t sigMean;
3365

    
3366
    a = float64_squash_input_denormal(a STATUS_VAR);
3367
    b = float64_squash_input_denormal(b STATUS_VAR);
3368
    aSig = extractFloat64Frac( a );
3369
    aExp = extractFloat64Exp( a );
3370
    aSign = extractFloat64Sign( a );
3371
    bSig = extractFloat64Frac( b );
3372
    bExp = extractFloat64Exp( b );
3373
    if ( aExp == 0x7FF ) {
3374
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3375
            return propagateFloat64NaN( a, b STATUS_VAR );
3376
        }
3377
        float_raise( float_flag_invalid STATUS_VAR);
3378
        return float64_default_nan;
3379
    }
3380
    if ( bExp == 0x7FF ) {
3381
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3382
        return a;
3383
    }
3384
    if ( bExp == 0 ) {
3385
        if ( bSig == 0 ) {
3386
            float_raise( float_flag_invalid STATUS_VAR);
3387
            return float64_default_nan;
3388
        }
3389
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3390
    }
3391
    if ( aExp == 0 ) {
3392
        if ( aSig == 0 ) return a;
3393
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3394
    }
3395
    expDiff = aExp - bExp;
3396
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3397
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3398
    if ( expDiff < 0 ) {
3399
        if ( expDiff < -1 ) return a;
3400
        aSig >>= 1;
3401
    }
3402
    q = ( bSig <= aSig );
3403
    if ( q ) aSig -= bSig;
3404
    expDiff -= 64;
3405
    while ( 0 < expDiff ) {
3406
        q = estimateDiv128To64( aSig, 0, bSig );
3407
        q = ( 2 < q ) ? q - 2 : 0;
3408
        aSig = - ( ( bSig>>2 ) * q );
3409
        expDiff -= 62;
3410
    }
3411
    expDiff += 64;
3412
    if ( 0 < expDiff ) {
3413
        q = estimateDiv128To64( aSig, 0, bSig );
3414
        q = ( 2 < q ) ? q - 2 : 0;
3415
        q >>= 64 - expDiff;
3416
        bSig >>= 2;
3417
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3418
    }
3419
    else {
3420
        aSig >>= 2;
3421
        bSig >>= 2;
3422
    }
3423
    do {
3424
        alternateASig = aSig;
3425
        ++q;
3426
        aSig -= bSig;
3427
    } while ( 0 <= (int64_t) aSig );
3428
    sigMean = aSig + alternateASig;
3429
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3430
        aSig = alternateASig;
3431
    }
3432
    zSign = ( (int64_t) aSig < 0 );
3433
    if ( zSign ) aSig = - aSig;
3434
    return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3435

    
3436
}
3437

    
3438
/*----------------------------------------------------------------------------
3439
| Returns the square root of the double-precision floating-point value `a'.
3440
| The operation is performed according to the IEC/IEEE Standard for Binary
3441
| Floating-Point Arithmetic.
3442
*----------------------------------------------------------------------------*/
3443

    
3444
float64 float64_sqrt( float64 a STATUS_PARAM )
3445
{
3446
    flag aSign;
3447
    int16 aExp, zExp;
3448
    uint64_t aSig, zSig, doubleZSig;
3449
    uint64_t rem0, rem1, term0, term1;
3450
    a = float64_squash_input_denormal(a STATUS_VAR);
3451

    
3452
    aSig = extractFloat64Frac( a );
3453
    aExp = extractFloat64Exp( a );
3454
    aSign = extractFloat64Sign( a );
3455
    if ( aExp == 0x7FF ) {
3456
        if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3457
        if ( ! aSign ) return a;
3458
        float_raise( float_flag_invalid STATUS_VAR);
3459
        return float64_default_nan;
3460
    }
3461
    if ( aSign ) {
3462
        if ( ( aExp | aSig ) == 0 ) return a;
3463
        float_raise( float_flag_invalid STATUS_VAR);
3464
        return float64_default_nan;
3465
    }
3466
    if ( aExp == 0 ) {
3467
        if ( aSig == 0 ) return float64_zero;
3468
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3469
    }
3470
    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3471
    aSig |= LIT64( 0x0010000000000000 );
3472
    zSig = estimateSqrt32( aExp, aSig>>21 );
3473
    aSig <<= 9 - ( aExp & 1 );
3474
    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3475
    if ( ( zSig & 0x1FF ) <= 5 ) {
3476
        doubleZSig = zSig<<1;
3477
        mul64To128( zSig, zSig, &term0, &term1 );
3478
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3479
        while ( (int64_t) rem0 < 0 ) {
3480
            --zSig;
3481
            doubleZSig -= 2;
3482
            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3483
        }
3484
        zSig |= ( ( rem0 | rem1 ) != 0 );
3485
    }
3486
    return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3487

    
3488
}
3489

    
3490
/*----------------------------------------------------------------------------
3491
| Returns the binary log of the double-precision floating-point value `a'.
3492
| The operation is performed according to the IEC/IEEE Standard for Binary
3493
| Floating-Point Arithmetic.
3494
*----------------------------------------------------------------------------*/
3495
float64 float64_log2( float64 a STATUS_PARAM )
3496
{
3497
    flag aSign, zSign;
3498
    int16 aExp;
3499
    uint64_t aSig, aSig0, aSig1, zSig, i;
3500
    a = float64_squash_input_denormal(a STATUS_VAR);
3501

    
3502
    aSig = extractFloat64Frac( a );
3503
    aExp = extractFloat64Exp( a );
3504
    aSign = extractFloat64Sign( a );
3505

    
3506
    if ( aExp == 0 ) {
3507
        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3508
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3509
    }
3510
    if ( aSign ) {
3511
        float_raise( float_flag_invalid STATUS_VAR);
3512
        return float64_default_nan;
3513
    }
3514
    if ( aExp == 0x7FF ) {
3515
        if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3516
        return a;
3517
    }
3518

    
3519
    aExp -= 0x3FF;
3520
    aSig |= LIT64( 0x0010000000000000 );
3521
    zSign = aExp < 0;
3522
    zSig = (uint64_t)aExp << 52;
3523
    for (i = 1LL << 51; i > 0; i >>= 1) {
3524
        mul64To128( aSig, aSig, &aSig0, &aSig1 );
3525
        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3526
        if ( aSig & LIT64( 0x0020000000000000 ) ) {
3527
            aSig >>= 1;
3528
            zSig |= i;
3529
        }
3530
    }
3531

    
3532
    if ( zSign )
3533
        zSig = -zSig;
3534
    return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3535
}
3536

    
3537
/*----------------------------------------------------------------------------
3538
| Returns 1 if the double-precision floating-point value `a' is equal to the
3539
| corresponding value `b', and 0 otherwise.  The comparison is performed
3540
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3541
*----------------------------------------------------------------------------*/
3542

    
3543
int float64_eq( float64 a, float64 b STATUS_PARAM )
3544
{
3545
    uint64_t av, bv;
3546
    a = float64_squash_input_denormal(a STATUS_VAR);
3547
    b = float64_squash_input_denormal(b STATUS_VAR);
3548

    
3549
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3550
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3551
       ) {
3552
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3553
            float_raise( float_flag_invalid STATUS_VAR);
3554
        }
3555
        return 0;
3556
    }
3557
    av = float64_val(a);
3558
    bv = float64_val(b);
3559
    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3560

    
3561
}
3562

    
3563
/*----------------------------------------------------------------------------
3564
| Returns 1 if the double-precision floating-point value `a' is less than or
3565
| equal to the corresponding value `b', and 0 otherwise.  The comparison is
3566
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3567
| Arithmetic.
3568
*----------------------------------------------------------------------------*/
3569

    
3570
int float64_le( float64 a, float64 b STATUS_PARAM )
3571
{
3572
    flag aSign, bSign;
3573
    uint64_t av, bv;
3574
    a = float64_squash_input_denormal(a STATUS_VAR);
3575
    b = float64_squash_input_denormal(b STATUS_VAR);
3576

    
3577
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3578
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3579
       ) {
3580
        float_raise( float_flag_invalid STATUS_VAR);
3581
        return 0;
3582
    }
3583
    aSign = extractFloat64Sign( a );
3584
    bSign = extractFloat64Sign( b );
3585
    av = float64_val(a);
3586
    bv = float64_val(b);
3587
    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3588
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3589

    
3590
}
3591

    
3592
/*----------------------------------------------------------------------------
3593
| Returns 1 if the double-precision floating-point value `a' is less than
3594
| the corresponding value `b', and 0 otherwise.  The comparison is performed
3595
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3596
*----------------------------------------------------------------------------*/
3597

    
3598
int float64_lt( float64 a, float64 b STATUS_PARAM )
3599
{
3600
    flag aSign, bSign;
3601
    uint64_t av, bv;
3602

    
3603
    a = float64_squash_input_denormal(a STATUS_VAR);
3604
    b = float64_squash_input_denormal(b STATUS_VAR);
3605
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3606
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3607
       ) {
3608
        float_raise( float_flag_invalid STATUS_VAR);
3609
        return 0;
3610
    }
3611
    aSign = extractFloat64Sign( a );
3612
    bSign = extractFloat64Sign( b );
3613
    av = float64_val(a);
3614
    bv = float64_val(b);
3615
    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3616
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3617

    
3618
}
3619

    
3620
/*----------------------------------------------------------------------------
3621
| Returns 1 if the double-precision floating-point value `a' is equal to the
3622
| corresponding value `b', and 0 otherwise.  The invalid exception is raised
3623
| if either operand is a NaN.  Otherwise, the comparison is performed
3624
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3625
*----------------------------------------------------------------------------*/
3626

    
3627
int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3628
{
3629
    uint64_t av, bv;
3630
    a = float64_squash_input_denormal(a STATUS_VAR);
3631
    b = float64_squash_input_denormal(b STATUS_VAR);
3632

    
3633
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3634
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3635
       ) {
3636
        float_raise( float_flag_invalid STATUS_VAR);
3637
        return 0;
3638
    }
3639
    av = float64_val(a);
3640
    bv = float64_val(b);
3641
    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3642

    
3643
}
3644

    
3645
/*----------------------------------------------------------------------------
3646
| Returns 1 if the double-precision floating-point value `a' is less than or
3647
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3648
| cause an exception.  Otherwise, the comparison is performed according to the
3649
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3650
*----------------------------------------------------------------------------*/
3651

    
3652
int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3653
{
3654
    flag aSign, bSign;
3655
    uint64_t av, bv;
3656
    a = float64_squash_input_denormal(a STATUS_VAR);
3657
    b = float64_squash_input_denormal(b STATUS_VAR);
3658

    
3659
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3660
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3661
       ) {
3662
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3663
            float_raise( float_flag_invalid STATUS_VAR);
3664
        }
3665
        return 0;
3666
    }
3667
    aSign = extractFloat64Sign( a );
3668
    bSign = extractFloat64Sign( b );
3669
    av = float64_val(a);
3670
    bv = float64_val(b);
3671
    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3672
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3673

    
3674
}
3675

    
3676
/*----------------------------------------------------------------------------
3677
| Returns 1 if the double-precision floating-point value `a' is less than
3678
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3679
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3680
| Standard for Binary Floating-Point Arithmetic.
3681
*----------------------------------------------------------------------------*/
3682

    
3683
int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3684
{
3685
    flag aSign, bSign;
3686
    uint64_t av, bv;
3687
    a = float64_squash_input_denormal(a STATUS_VAR);
3688
    b = float64_squash_input_denormal(b STATUS_VAR);
3689

    
3690
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3691
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3692
       ) {
3693
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3694
            float_raise( float_flag_invalid STATUS_VAR);
3695
        }
3696
        return 0;
3697
    }
3698
    aSign = extractFloat64Sign( a );
3699
    bSign = extractFloat64Sign( b );
3700
    av = float64_val(a);
3701
    bv = float64_val(b);
3702
    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3703
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3704

    
3705
}
3706

    
3707
#ifdef FLOATX80
3708

    
3709
/*----------------------------------------------------------------------------
3710
| Returns the result of converting the extended double-precision floating-
3711
| point value `a' to the 32-bit two's complement integer format.  The
3712
| conversion is performed according to the IEC/IEEE Standard for Binary
3713
| Floating-Point Arithmetic---which means in particular that the conversion
3714
| is rounded according to the current rounding mode.  If `a' is a NaN, the
3715
| largest positive integer is returned.  Otherwise, if the conversion
3716
| overflows, the largest integer with the same sign as `a' is returned.
3717
*----------------------------------------------------------------------------*/
3718

    
3719
int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3720
{
3721
    flag aSign;
3722
    int32 aExp, shiftCount;
3723
    uint64_t aSig;
3724

    
3725
    aSig = extractFloatx80Frac( a );
3726
    aExp = extractFloatx80Exp( a );
3727
    aSign = extractFloatx80Sign( a );
3728
    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3729
    shiftCount = 0x4037 - aExp;
3730
    if ( shiftCount <= 0 ) shiftCount = 1;
3731
    shift64RightJamming( aSig, shiftCount, &aSig );
3732
    return roundAndPackInt32( aSign, aSig STATUS_VAR );
3733

    
3734
}
3735

    
3736
/*----------------------------------------------------------------------------
3737
| Returns the result of converting the extended double-precision floating-
3738
| point value `a' to the 32-bit two's complement integer format.  The
3739
| conversion is performed according to the IEC/IEEE Standard for Binary
3740
| Floating-Point Arithmetic, except that the conversion is always rounded
3741
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3742
| Otherwise, if the conversion overflows, the largest integer with the same
3743
| sign as `a' is returned.
3744
*----------------------------------------------------------------------------*/
3745

    
3746
int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3747
{
3748
    flag aSign;
3749
    int32 aExp, shiftCount;
3750
    uint64_t aSig, savedASig;
3751
    int32 z;
3752

    
3753
    aSig = extractFloatx80Frac( a );
3754
    aExp = extractFloatx80Exp( a );
3755
    aSign = extractFloatx80Sign( a );
3756
    if ( 0x401E < aExp ) {
3757
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3758
        goto invalid;
3759
    }
3760
    else if ( aExp < 0x3FFF ) {
3761
        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3762
        return 0;
3763
    }
3764
    shiftCount = 0x403E - aExp;
3765
    savedASig = aSig;
3766
    aSig >>= shiftCount;
3767
    z = aSig;
3768
    if ( aSign ) z = - z;
3769
    if ( ( z < 0 ) ^ aSign ) {
3770
 invalid:
3771
        float_raise( float_flag_invalid STATUS_VAR);
3772
        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3773
    }
3774
    if ( ( aSig<<shiftCount ) != savedASig ) {
3775
        STATUS(float_exception_flags) |= float_flag_inexact;
3776
    }
3777
    return z;
3778

    
3779
}
3780

    
3781
/*----------------------------------------------------------------------------
3782
| Returns the result of converting the extended double-precision floating-
3783
| point value `a' to the 64-bit two's complement integer format.  The
3784
| conversion is performed according to the IEC/IEEE Standard for Binary
3785
| Floating-Point Arithmetic---which means in particular that the conversion
3786
| is rounded according to the current rounding mode.  If `a' is a NaN,
3787
| the largest positive integer is returned.  Otherwise, if the conversion
3788
| overflows, the largest integer with the same sign as `a' is returned.
3789
*----------------------------------------------------------------------------*/
3790

    
3791
int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3792
{
3793
    flag aSign;
3794
    int32 aExp, shiftCount;
3795
    uint64_t aSig, aSigExtra;
3796

    
3797
    aSig = extractFloatx80Frac( a );
3798
    aExp = extractFloatx80Exp( a );
3799
    aSign = extractFloatx80Sign( a );
3800
    shiftCount = 0x403E - aExp;
3801
    if ( shiftCount <= 0 ) {
3802
        if ( shiftCount ) {
3803
            float_raise( float_flag_invalid STATUS_VAR);
3804
            if (    ! aSign
3805
                 || (    ( aExp == 0x7FFF )
3806
                      && ( aSig != LIT64( 0x8000000000000000 ) ) )
3807
               ) {
3808
                return LIT64( 0x7FFFFFFFFFFFFFFF );
3809
            }
3810
            return (int64_t) LIT64( 0x8000000000000000 );
3811
        }
3812
        aSigExtra = 0;
3813
    }
3814
    else {
3815
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3816
    }
3817
    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3818

    
3819
}
3820

    
3821
/*----------------------------------------------------------------------------
3822
| Returns the result of converting the extended double-precision floating-
3823
| point value `a' to the 64-bit two's complement integer format.  The
3824
| conversion is performed according to the IEC/IEEE Standard for Binary
3825
| Floating-Point Arithmetic, except that the conversion is always rounded
3826
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3827
| Otherwise, if the conversion overflows, the largest integer with the same
3828
| sign as `a' is returned.
3829
*----------------------------------------------------------------------------*/
3830

    
3831
int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3832
{
3833
    flag aSign;
3834
    int32 aExp, shiftCount;
3835
    uint64_t aSig;
3836
    int64 z;
3837

    
3838
    aSig = extractFloatx80Frac( a );
3839
    aExp = extractFloatx80Exp( a );
3840
    aSign = extractFloatx80Sign( a );
3841
    shiftCount = aExp - 0x403E;
3842
    if ( 0 <= shiftCount ) {
3843
        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3844
        if ( ( a.high != 0xC03E ) || aSig ) {
3845
            float_raise( float_flag_invalid STATUS_VAR);
3846
            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3847
                return LIT64( 0x7FFFFFFFFFFFFFFF );
3848
            }
3849
        }
3850
        return (int64_t) LIT64( 0x8000000000000000 );
3851
    }
3852
    else if ( aExp < 0x3FFF ) {
3853
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3854
        return 0;
3855
    }
3856
    z = aSig>>( - shiftCount );
3857
    if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3858
        STATUS(float_exception_flags) |= float_flag_inexact;
3859
    }
3860
    if ( aSign ) z = - z;
3861
    return z;
3862

    
3863
}
3864

    
3865
/*----------------------------------------------------------------------------
3866
| Returns the result of converting the extended double-precision floating-
3867
| point value `a' to the single-precision floating-point format.  The
3868
| conversion is performed according to the IEC/IEEE Standard for Binary
3869
| Floating-Point Arithmetic.
3870
*----------------------------------------------------------------------------*/
3871

    
3872
float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3873
{
3874
    flag aSign;
3875
    int32 aExp;
3876
    uint64_t aSig;
3877

    
3878
    aSig = extractFloatx80Frac( a );
3879
    aExp = extractFloatx80Exp( a );
3880
    aSign = extractFloatx80Sign( a );
3881
    if ( aExp == 0x7FFF ) {
3882
        if ( (uint64_t) ( aSig<<1 ) ) {
3883
            return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3884
        }
3885
        return packFloat32( aSign, 0xFF, 0 );
3886
    }
3887
    shift64RightJamming( aSig, 33, &aSig );
3888
    if ( aExp || aSig ) aExp -= 0x3F81;
3889
    return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3890

    
3891
}
3892

    
3893
/*----------------------------------------------------------------------------
3894
| Returns the result of converting the extended double-precision floating-
3895
| point value `a' to the double-precision floating-point format.  The
3896
| conversion is performed according to the IEC/IEEE Standard for Binary
3897
| Floating-Point Arithmetic.
3898
*----------------------------------------------------------------------------*/
3899

    
3900
float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3901
{
3902
    flag aSign;
3903
    int32 aExp;
3904
    uint64_t aSig, zSig;
3905

    
3906
    aSig = extractFloatx80Frac( a );
3907
    aExp = extractFloatx80Exp( a );
3908
    aSign = extractFloatx80Sign( a );
3909
    if ( aExp == 0x7FFF ) {
3910
        if ( (uint64_t) ( aSig<<1 ) ) {
3911
            return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3912
        }
3913
        return packFloat64( aSign, 0x7FF, 0 );
3914
    }
3915
    shift64RightJamming( aSig, 1, &zSig );
3916
    if ( aExp || aSig ) aExp -= 0x3C01;
3917
    return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3918

    
3919
}
3920

    
3921
#ifdef FLOAT128
3922

    
3923
/*----------------------------------------------------------------------------
3924
| Returns the result of converting the extended double-precision floating-
3925
| point value `a' to the quadruple-precision floating-point format.  The
3926
| conversion is performed according to the IEC/IEEE Standard for Binary
3927
| Floating-Point Arithmetic.
3928
*----------------------------------------------------------------------------*/
3929

    
3930
float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3931
{
3932
    flag aSign;
3933
    int16 aExp;
3934
    uint64_t aSig, zSig0, zSig1;
3935

    
3936
    aSig = extractFloatx80Frac( a );
3937
    aExp = extractFloatx80Exp( a );
3938
    aSign = extractFloatx80Sign( a );
3939
    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
3940
        return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3941
    }
3942
    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3943
    return packFloat128( aSign, aExp, zSig0, zSig1 );
3944

    
3945
}
3946

    
3947
#endif
3948

    
3949
/*----------------------------------------------------------------------------
3950
| Rounds the extended double-precision floating-point value `a' to an integer,
3951
| and returns the result as an extended quadruple-precision floating-point
3952
| value.  The operation is performed according to the IEC/IEEE Standard for
3953
| Binary Floating-Point Arithmetic.
3954
*----------------------------------------------------------------------------*/
3955

    
3956
floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3957
{
3958
    flag aSign;
3959
    int32 aExp;
3960
    uint64_t lastBitMask, roundBitsMask;
3961
    int8 roundingMode;
3962
    floatx80 z;
3963

    
3964
    aExp = extractFloatx80Exp( a );
3965
    if ( 0x403E <= aExp ) {
3966
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
3967
            return propagateFloatx80NaN( a, a STATUS_VAR );
3968
        }
3969
        return a;
3970
    }
3971
    if ( aExp < 0x3FFF ) {
3972
        if (    ( aExp == 0 )
3973
             && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3974
            return a;
3975
        }
3976
        STATUS(float_exception_flags) |= float_flag_inexact;
3977
        aSign = extractFloatx80Sign( a );
3978
        switch ( STATUS(float_rounding_mode) ) {
3979
         case float_round_nearest_even:
3980
            if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
3981
               ) {
3982
                return
3983
                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3984
            }
3985
            break;
3986
         case float_round_down:
3987
            return
3988
                  aSign ?
3989
                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3990
                : packFloatx80( 0, 0, 0 );
3991
         case float_round_up:
3992
            return
3993
                  aSign ? packFloatx80( 1, 0, 0 )
3994
                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3995
        }
3996
        return packFloatx80( aSign, 0, 0 );
3997
    }
3998
    lastBitMask = 1;
3999
    lastBitMask <<= 0x403E - aExp;
4000
    roundBitsMask = lastBitMask - 1;
4001
    z = a;
4002
    roundingMode = STATUS(float_rounding_mode);
4003
    if ( roundingMode == float_round_nearest_even ) {
4004
        z.low += lastBitMask>>1;
4005
        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4006
    }
4007
    else if ( roundingMode != float_round_to_zero ) {
4008
        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4009
            z.low += roundBitsMask;
4010
        }
4011
    }
4012
    z.low &= ~ roundBitsMask;
4013
    if ( z.low == 0 ) {
4014
        ++z.high;
4015
        z.low = LIT64( 0x8000000000000000 );
4016
    }
4017
    if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4018
    return z;
4019

    
4020
}
4021

    
4022
/*----------------------------------------------------------------------------
4023
| Returns the result of adding the absolute values of the extended double-
4024
| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4025
| negated before being returned.  `zSign' is ignored if the result is a NaN.
4026
| The addition is performed according to the IEC/IEEE Standard for Binary
4027
| Floating-Point Arithmetic.
4028
*----------------------------------------------------------------------------*/
4029

    
4030
static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4031
{
4032
    int32 aExp, bExp, zExp;
4033
    uint64_t aSig, bSig, zSig0, zSig1;
4034
    int32 expDiff;
4035

    
4036
    aSig = extractFloatx80Frac( a );
4037
    aExp = extractFloatx80Exp( a );
4038
    bSig = extractFloatx80Frac( b );
4039
    bExp = extractFloatx80Exp( b );
4040
    expDiff = aExp - bExp;
4041
    if ( 0 < expDiff ) {
4042
        if ( aExp == 0x7FFF ) {
4043
            if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4044
            return a;
4045
        }
4046
        if ( bExp == 0 ) --expDiff;
4047
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4048
        zExp = aExp;
4049
    }
4050
    else if ( expDiff < 0 ) {
4051
        if ( bExp == 0x7FFF ) {
4052
            if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4053
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4054
        }
4055
        if ( aExp == 0 ) ++expDiff;
4056
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4057
        zExp = bExp;
4058
    }
4059
    else {
4060
        if ( aExp == 0x7FFF ) {
4061
            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4062
                return propagateFloatx80NaN( a, b STATUS_VAR );
4063
            }
4064
            return a;
4065
        }
4066
        zSig1 = 0;
4067
        zSig0 = aSig + bSig;
4068
        if ( aExp == 0 ) {
4069
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4070
            goto roundAndPack;
4071
        }
4072
        zExp = aExp;
4073
        goto shiftRight1;
4074
    }
4075
    zSig0 = aSig + bSig;
4076
    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4077
 shiftRight1:
4078
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4079
    zSig0 |= LIT64( 0x8000000000000000 );
4080
    ++zExp;
4081
 roundAndPack:
4082
    return
4083
        roundAndPackFloatx80(
4084
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4085

    
4086
}
4087

    
4088
/*----------------------------------------------------------------------------
4089
| Returns the result of subtracting the absolute values of the extended
4090
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4091
| difference is negated before being returned.  `zSign' is ignored if the
4092
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
4093
| Standard for Binary Floating-Point Arithmetic.
4094
*----------------------------------------------------------------------------*/
4095

    
4096
static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4097
{
4098
    int32 aExp, bExp, zExp;
4099
    uint64_t aSig, bSig, zSig0, zSig1;
4100
    int32 expDiff;
4101
    floatx80 z;
4102

    
4103
    aSig = extractFloatx80Frac( a );
4104
    aExp = extractFloatx80Exp( a );
4105
    bSig = extractFloatx80Frac( b );
4106
    bExp = extractFloatx80Exp( b );
4107
    expDiff = aExp - bExp;
4108
    if ( 0 < expDiff ) goto aExpBigger;
4109
    if ( expDiff < 0 ) goto bExpBigger;
4110
    if ( aExp == 0x7FFF ) {
4111
        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4112
            return propagateFloatx80NaN( a, b STATUS_VAR );
4113
        }
4114
        float_raise( float_flag_invalid STATUS_VAR);
4115
        z.low = floatx80_default_nan_low;
4116
        z.high = floatx80_default_nan_high;
4117
        return z;
4118
    }
4119
    if ( aExp == 0 ) {
4120
        aExp = 1;
4121
        bExp = 1;
4122
    }
4123
    zSig1 = 0;
4124
    if ( bSig < aSig ) goto aBigger;
4125
    if ( aSig < bSig ) goto bBigger;
4126
    return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4127
 bExpBigger:
4128
    if ( bExp == 0x7FFF ) {
4129
        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4130
        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4131
    }
4132
    if ( aExp == 0 ) ++expDiff;
4133
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4134
 bBigger:
4135
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4136
    zExp = bExp;
4137
    zSign ^= 1;
4138
    goto normalizeRoundAndPack;
4139
 aExpBigger:
4140
    if ( aExp == 0x7FFF ) {
4141
        if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4142
        return a;
4143
    }
4144
    if ( bExp == 0 ) --expDiff;
4145
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4146
 aBigger:
4147
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4148
    zExp = aExp;
4149
 normalizeRoundAndPack:
4150
    return
4151
        normalizeRoundAndPackFloatx80(
4152
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4153

    
4154
}
4155

    
4156
/*----------------------------------------------------------------------------
4157
| Returns the result of adding the extended double-precision floating-point
4158
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4159
| Standard for Binary Floating-Point Arithmetic.
4160
*----------------------------------------------------------------------------*/
4161

    
4162
floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4163
{
4164
    flag aSign, bSign;
4165

    
4166
    aSign = extractFloatx80Sign( a );
4167
    bSign = extractFloatx80Sign( b );
4168
    if ( aSign == bSign ) {
4169
        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4170
    }
4171
    else {
4172
        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4173
    }
4174

    
4175
}
4176

    
4177
/*----------------------------------------------------------------------------
4178
| Returns the result of subtracting the extended double-precision floating-
4179
| point values `a' and `b'.  The operation is performed according to the
4180
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4181
*----------------------------------------------------------------------------*/
4182

    
4183
floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4184
{
4185
    flag aSign, bSign;
4186

    
4187
    aSign = extractFloatx80Sign( a );
4188
    bSign = extractFloatx80Sign( b );
4189
    if ( aSign == bSign ) {
4190
        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4191
    }
4192
    else {
4193
        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4194
    }
4195

    
4196
}
4197

    
4198
/*----------------------------------------------------------------------------
4199
| Returns the result of multiplying the extended double-precision floating-
4200
| point values `a' and `b'.  The operation is performed according to the
4201
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4202
*----------------------------------------------------------------------------*/
4203

    
4204
floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4205
{
4206
    flag aSign, bSign, zSign;
4207
    int32 aExp, bExp, zExp;
4208
    uint64_t aSig, bSig, zSig0, zSig1;
4209
    floatx80 z;
4210

    
4211
    aSig = extractFloatx80Frac( a );
4212
    aExp = extractFloatx80Exp( a );
4213
    aSign = extractFloatx80Sign( a );
4214
    bSig = extractFloatx80Frac( b );
4215
    bExp = extractFloatx80Exp( b );
4216
    bSign = extractFloatx80Sign( b );
4217
    zSign = aSign ^ bSign;
4218
    if ( aExp == 0x7FFF ) {
4219
        if (    (uint64_t) ( aSig<<1 )
4220
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4221
            return propagateFloatx80NaN( a, b STATUS_VAR );
4222
        }
4223
        if ( ( bExp | bSig ) == 0 ) goto invalid;
4224
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4225
    }
4226
    if ( bExp == 0x7FFF ) {
4227
        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4228
        if ( ( aExp | aSig ) == 0 ) {
4229
 invalid:
4230
            float_raise( float_flag_invalid STATUS_VAR);
4231
            z.low = floatx80_default_nan_low;
4232
            z.high = floatx80_default_nan_high;
4233
            return z;
4234
        }
4235
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4236
    }
4237
    if ( aExp == 0 ) {
4238
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4239
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4240
    }
4241
    if ( bExp == 0 ) {
4242
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4243
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4244
    }
4245
    zExp = aExp + bExp - 0x3FFE;
4246
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
4247
    if ( 0 < (int64_t) zSig0 ) {
4248
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4249
        --zExp;
4250
    }
4251
    return
4252
        roundAndPackFloatx80(
4253
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4254

    
4255
}
4256

    
4257
/*----------------------------------------------------------------------------
4258
| Returns the result of dividing the extended double-precision floating-point
4259
| value `a' by the corresponding value `b'.  The operation is performed
4260
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4261
*----------------------------------------------------------------------------*/
4262

    
4263
floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4264
{
4265
    flag aSign, bSign, zSign;
4266
    int32 aExp, bExp, zExp;
4267
    uint64_t aSig, bSig, zSig0, zSig1;
4268
    uint64_t rem0, rem1, rem2, term0, term1, term2;
4269
    floatx80 z;
4270

    
4271
    aSig = extractFloatx80Frac( a );
4272
    aExp = extractFloatx80Exp( a );
4273
    aSign = extractFloatx80Sign( a );
4274
    bSig = extractFloatx80Frac( b );
4275
    bExp = extractFloatx80Exp( b );
4276
    bSign = extractFloatx80Sign( b );
4277
    zSign = aSign ^ bSign;
4278
    if ( aExp == 0x7FFF ) {
4279
        if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4280
        if ( bExp == 0x7FFF ) {
4281
            if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4282
            goto invalid;
4283
        }
4284
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4285
    }
4286
    if ( bExp == 0x7FFF ) {
4287
        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4288
        return packFloatx80( zSign, 0, 0 );
4289
    }
4290
    if ( bExp == 0 ) {
4291
        if ( bSig == 0 ) {
4292
            if ( ( aExp | aSig ) == 0 ) {
4293
 invalid:
4294
                float_raise( float_flag_invalid STATUS_VAR);
4295
                z.low = floatx80_default_nan_low;
4296
                z.high = floatx80_default_nan_high;
4297
                return z;
4298
            }
4299
            float_raise( float_flag_divbyzero STATUS_VAR);
4300
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4301
        }
4302
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4303
    }
4304
    if ( aExp == 0 ) {
4305
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4306
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4307
    }
4308
    zExp = aExp - bExp + 0x3FFE;
4309
    rem1 = 0;
4310
    if ( bSig <= aSig ) {
4311
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
4312
        ++zExp;
4313
    }
4314
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4315
    mul64To128( bSig, zSig0, &term0, &term1 );
4316
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4317
    while ( (int64_t) rem0 < 0 ) {
4318
        --zSig0;
4319
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4320
    }
4321
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
4322
    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4323
        mul64To128( bSig, zSig1, &term1, &term2 );
4324
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4325
        while ( (int64_t) rem1 < 0 ) {
4326
            --zSig1;
4327
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4328
        }
4329
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
4330
    }
4331
    return
4332
        roundAndPackFloatx80(
4333
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4334

    
4335
}
4336

    
4337
/*----------------------------------------------------------------------------
4338
| Returns the remainder of the extended double-precision floating-point value
4339
| `a' with respect to the corresponding value `b'.  The operation is performed
4340
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4341
*----------------------------------------------------------------------------*/
4342

    
4343
floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4344
{
4345
    flag aSign, zSign;
4346
    int32 aExp, bExp, expDiff;
4347
    uint64_t aSig0, aSig1, bSig;
4348
    uint64_t q, term0, term1, alternateASig0, alternateASig1;
4349
    floatx80 z;
4350

    
4351
    aSig0 = extractFloatx80Frac( a );
4352
    aExp = extractFloatx80Exp( a );
4353
    aSign = extractFloatx80Sign( a );
4354
    bSig = extractFloatx80Frac( b );
4355
    bExp = extractFloatx80Exp( b );
4356
    if ( aExp == 0x7FFF ) {
4357
        if (    (uint64_t) ( aSig0<<1 )
4358
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4359
            return propagateFloatx80NaN( a, b STATUS_VAR );
4360
        }
4361
        goto invalid;
4362
    }
4363
    if ( bExp == 0x7FFF ) {
4364
        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4365
        return a;
4366
    }
4367
    if ( bExp == 0 ) {
4368
        if ( bSig == 0 ) {
4369
 invalid:
4370
            float_raise( float_flag_invalid STATUS_VAR);
4371
            z.low = floatx80_default_nan_low;
4372
            z.high = floatx80_default_nan_high;
4373
            return z;
4374
        }
4375
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4376
    }
4377
    if ( aExp == 0 ) {
4378
        if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
4379
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4380
    }
4381
    bSig |= LIT64( 0x8000000000000000 );
4382
    zSign = aSign;
4383
    expDiff = aExp - bExp;
4384
    aSig1 = 0;
4385
    if ( expDiff < 0 ) {
4386
        if ( expDiff < -1 ) return a;
4387
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4388
        expDiff = 0;
4389
    }
4390
    q = ( bSig <= aSig0 );
4391
    if ( q ) aSig0 -= bSig;
4392
    expDiff -= 64;
4393
    while ( 0 < expDiff ) {
4394
        q = estimateDiv128To64( aSig0, aSig1, bSig );
4395
        q = ( 2 < q ) ? q - 2 : 0;
4396
        mul64To128( bSig, q, &term0, &term1 );
4397
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4398
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4399
        expDiff -= 62;
4400
    }
4401
    expDiff += 64;
4402
    if ( 0 < expDiff ) {
4403
        q = estimateDiv128To64( aSig0, aSig1, bSig );
4404
        q = ( 2 < q ) ? q - 2 : 0;
4405
        q >>= 64 - expDiff;
4406
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4407
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4408
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4409
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
4410
            ++q;
4411
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4412
        }
4413
    }
4414
    else {
4415
        term1 = 0;
4416
        term0 = bSig;
4417
    }
4418
    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4419
    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4420
         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4421
              && ( q & 1 ) )
4422
       ) {
4423
        aSig0 = alternateASig0;
4424
        aSig1 = alternateASig1;
4425
        zSign = ! zSign;
4426
    }
4427
    return
4428
        normalizeRoundAndPackFloatx80(
4429
            80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4430

    
4431
}
4432

    
4433
/*----------------------------------------------------------------------------
4434
| Returns the square root of the extended double-precision floating-point
4435
| value `a'.  The operation is performed according to the IEC/IEEE Standard
4436
| for Binary Floating-Point Arithmetic.
4437
*----------------------------------------------------------------------------*/
4438

    
4439
floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4440
{
4441
    flag aSign;
4442
    int32 aExp, zExp;
4443
    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4444
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4445
    floatx80 z;
4446

    
4447
    aSig0 = extractFloatx80Frac( a );
4448
    aExp = extractFloatx80Exp( a );
4449
    aSign = extractFloatx80Sign( a );
4450
    if ( aExp == 0x7FFF ) {
4451
        if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4452
        if ( ! aSign ) return a;
4453
        goto invalid;
4454
    }
4455
    if ( aSign ) {
4456
        if ( ( aExp | aSig0 ) == 0 ) return a;
4457
 invalid:
4458
        float_raise( float_flag_invalid STATUS_VAR);
4459
        z.low = floatx80_default_nan_low;
4460
        z.high = floatx80_default_nan_high;
4461
        return z;
4462
    }
4463
    if ( aExp == 0 ) {
4464
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4465
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4466
    }
4467
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4468
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4469
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4470
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4471
    doubleZSig0 = zSig0<<1;
4472
    mul64To128( zSig0, zSig0, &term0, &term1 );
4473
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4474
    while ( (int64_t) rem0 < 0 ) {
4475
        --zSig0;
4476
        doubleZSig0 -= 2;
4477
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4478
    }
4479
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4480
    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4481
        if ( zSig1 == 0 ) zSig1 = 1;
4482
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4483
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4484
        mul64To128( zSig1, zSig1, &term2, &term3 );
4485
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4486
        while ( (int64_t) rem1 < 0 ) {
4487
            --zSig1;
4488
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4489
            term3 |= 1;
4490
            term2 |= doubleZSig0;
4491
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4492
        }
4493
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4494
    }
4495
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4496
    zSig0 |= doubleZSig0;
4497
    return
4498
        roundAndPackFloatx80(
4499
            STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4500

    
4501
}
4502

    
4503
/*----------------------------------------------------------------------------
4504
| Returns 1 if the extended double-precision floating-point value `a' is
4505
| equal to the corresponding value `b', and 0 otherwise.  The comparison is
4506
| performed according to the IEC/IEEE Standard for Binary Floating-Point
4507
| Arithmetic.
4508
*----------------------------------------------------------------------------*/
4509

    
4510
int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4511
{
4512

    
4513
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4514
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4515
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4516
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4517
       ) {
4518
        if (    floatx80_is_signaling_nan( a )
4519
             || floatx80_is_signaling_nan( b ) ) {
4520
            float_raise( float_flag_invalid STATUS_VAR);
4521
        }
4522
        return 0;
4523
    }
4524
    return
4525
           ( a.low == b.low )
4526
        && (    ( a.high == b.high )
4527
             || (    ( a.low == 0 )
4528
                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4529
           );
4530

    
4531
}
4532

    
4533
/*----------------------------------------------------------------------------
4534
| Returns 1 if the extended double-precision floating-point value `a' is
4535
| less than or equal to the corresponding value `b', and 0 otherwise.  The
4536
| comparison is performed according to the IEC/IEEE Standard for Binary
4537
| Floating-Point Arithmetic.
4538
*----------------------------------------------------------------------------*/
4539

    
4540
int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4541
{
4542
    flag aSign, bSign;
4543

    
4544
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4545
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4546
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4547
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4548
       ) {
4549
        float_raise( float_flag_invalid STATUS_VAR);
4550
        return 0;
4551
    }
4552
    aSign = extractFloatx80Sign( a );
4553
    bSign = extractFloatx80Sign( b );
4554
    if ( aSign != bSign ) {
4555
        return
4556
               aSign
4557
            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4558
                 == 0 );
4559
    }
4560
    return
4561
          aSign ? le128( b.high, b.low, a.high, a.low )
4562
        : le128( a.high, a.low, b.high, b.low );
4563

    
4564
}
4565

    
4566
/*----------------------------------------------------------------------------
4567
| Returns 1 if the extended double-precision floating-point value `a' is
4568
| less than the corresponding value `b', and 0 otherwise.  The comparison
4569
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4570
| Arithmetic.
4571
*----------------------------------------------------------------------------*/
4572

    
4573
int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4574
{
4575
    flag aSign, bSign;
4576

    
4577
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4578
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4579
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4580
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4581
       ) {
4582
        float_raise( float_flag_invalid STATUS_VAR);
4583
        return 0;
4584
    }
4585
    aSign = extractFloatx80Sign( a );
4586
    bSign = extractFloatx80Sign( b );
4587
    if ( aSign != bSign ) {
4588
        return
4589
               aSign
4590
            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4591
                 != 0 );
4592
    }
4593
    return
4594
          aSign ? lt128( b.high, b.low, a.high, a.low )
4595
        : lt128( a.high, a.low, b.high, b.low );
4596

    
4597
}
4598

    
4599
/*----------------------------------------------------------------------------
4600
| Returns 1 if the extended double-precision floating-point value `a' is equal
4601
| to the corresponding value `b', and 0 otherwise.  The invalid exception is
4602
| raised if either operand is a NaN.  Otherwise, the comparison is performed
4603
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4604
*----------------------------------------------------------------------------*/
4605

    
4606
int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4607
{
4608

    
4609
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4610
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4611
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4612
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4613
       ) {
4614
        float_raise( float_flag_invalid STATUS_VAR);
4615
        return 0;
4616
    }
4617
    return
4618
           ( a.low == b.low )
4619
        && (    ( a.high == b.high )
4620
             || (    ( a.low == 0 )
4621
                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4622
           );
4623

    
4624
}
4625

    
4626
/*----------------------------------------------------------------------------
4627
| Returns 1 if the extended double-precision floating-point value `a' is less
4628
| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4629
| do not cause an exception.  Otherwise, the comparison is performed according
4630
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4631
*----------------------------------------------------------------------------*/
4632

    
4633
int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4634
{
4635
    flag aSign, bSign;
4636

    
4637
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4638
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4639
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4640
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4641
       ) {
4642
        if (    floatx80_is_signaling_nan( a )
4643
             || floatx80_is_signaling_nan( b ) ) {
4644
            float_raise( float_flag_invalid STATUS_VAR);
4645
        }
4646
        return 0;
4647
    }
4648
    aSign = extractFloatx80Sign( a );
4649
    bSign = extractFloatx80Sign( b );
4650
    if ( aSign != bSign ) {
4651
        return
4652
               aSign
4653
            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4654
                 == 0 );
4655
    }
4656
    return
4657
          aSign ? le128( b.high, b.low, a.high, a.low )
4658
        : le128( a.high, a.low, b.high, b.low );
4659

    
4660
}
4661

    
4662
/*----------------------------------------------------------------------------
4663
| Returns 1 if the extended double-precision floating-point value `a' is less
4664
| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4665
| an exception.  Otherwise, the comparison is performed according to the
4666
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4667
*----------------------------------------------------------------------------*/
4668

    
4669
int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4670
{
4671
    flag aSign, bSign;
4672

    
4673
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4674
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4675
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4676
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4677
       ) {
4678
        if (    floatx80_is_signaling_nan( a )
4679
             || floatx80_is_signaling_nan( b ) ) {
4680
            float_raise( float_flag_invalid STATUS_VAR);
4681
        }
4682
        return 0;
4683
    }
4684
    aSign = extractFloatx80Sign( a );
4685
    bSign = extractFloatx80Sign( b );
4686
    if ( aSign != bSign ) {
4687
        return
4688
               aSign
4689
            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4690
                 != 0 );
4691
    }
4692
    return
4693
          aSign ? lt128( b.high, b.low, a.high, a.low )
4694
        : lt128( a.high, a.low, b.high, b.low );
4695

    
4696
}
4697

    
4698
#endif
4699

    
4700
#ifdef FLOAT128
4701

    
4702
/*----------------------------------------------------------------------------
4703
| Returns the result of converting the quadruple-precision floating-point
4704
| value `a' to the 32-bit two's complement integer format.  The conversion
4705
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4706
| Arithmetic---which means in particular that the conversion is rounded
4707
| according to the current rounding mode.  If `a' is a NaN, the largest
4708
| positive integer is returned.  Otherwise, if the conversion overflows, the
4709
| largest integer with the same sign as `a' is returned.
4710
*----------------------------------------------------------------------------*/
4711

    
4712
int32 float128_to_int32( float128 a STATUS_PARAM )
4713
{
4714
    flag aSign;
4715
    int32 aExp, shiftCount;
4716
    uint64_t aSig0, aSig1;
4717

    
4718
    aSig1 = extractFloat128Frac1( a );
4719
    aSig0 = extractFloat128Frac0( a );
4720
    aExp = extractFloat128Exp( a );
4721
    aSign = extractFloat128Sign( a );
4722
    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4723
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4724
    aSig0 |= ( aSig1 != 0 );
4725
    shiftCount = 0x4028 - aExp;
4726
    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4727
    return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4728

    
4729
}
4730

    
4731
/*----------------------------------------------------------------------------
4732
| Returns the result of converting the quadruple-precision floating-point
4733
| value `a' to the 32-bit two's complement integer format.  The conversion
4734
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4735
| Arithmetic, except that the conversion is always rounded toward zero.  If
4736
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4737
| conversion overflows, the largest integer with the same sign as `a' is
4738
| returned.
4739
*----------------------------------------------------------------------------*/
4740

    
4741
int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4742
{
4743
    flag aSign;
4744
    int32 aExp, shiftCount;
4745
    uint64_t aSig0, aSig1, savedASig;
4746
    int32 z;
4747

    
4748
    aSig1 = extractFloat128Frac1( a );
4749
    aSig0 = extractFloat128Frac0( a );
4750
    aExp = extractFloat128Exp( a );
4751
    aSign = extractFloat128Sign( a );
4752
    aSig0 |= ( aSig1 != 0 );
4753
    if ( 0x401E < aExp ) {
4754
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4755
        goto invalid;
4756
    }
4757
    else if ( aExp < 0x3FFF ) {
4758
        if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4759
        return 0;
4760
    }
4761
    aSig0 |= LIT64( 0x0001000000000000 );
4762
    shiftCount = 0x402F - aExp;
4763
    savedASig = aSig0;
4764
    aSig0 >>= shiftCount;
4765
    z = aSig0;
4766
    if ( aSign ) z = - z;
4767
    if ( ( z < 0 ) ^ aSign ) {
4768
 invalid:
4769
        float_raise( float_flag_invalid STATUS_VAR);
4770
        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4771
    }
4772
    if ( ( aSig0<<shiftCount ) != savedASig ) {
4773
        STATUS(float_exception_flags) |= float_flag_inexact;
4774
    }
4775
    return z;
4776

    
4777
}
4778

    
4779
/*----------------------------------------------------------------------------
4780
| Returns the result of converting the quadruple-precision floating-point
4781
| value `a' to the 64-bit two's complement integer format.  The conversion
4782
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4783
| Arithmetic---which means in particular that the conversion is rounded
4784
| according to the current rounding mode.  If `a' is a NaN, the largest
4785
| positive integer is returned.  Otherwise, if the conversion overflows, the
4786
| largest integer with the same sign as `a' is returned.
4787
*----------------------------------------------------------------------------*/
4788

    
4789
int64 float128_to_int64( float128 a STATUS_PARAM )
4790
{
4791
    flag aSign;
4792
    int32 aExp, shiftCount;
4793
    uint64_t aSig0, aSig1;
4794

    
4795
    aSig1 = extractFloat128Frac1( a );
4796
    aSig0 = extractFloat128Frac0( a );
4797
    aExp = extractFloat128Exp( a );
4798
    aSign = extractFloat128Sign( a );
4799
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4800
    shiftCount = 0x402F - aExp;
4801
    if ( shiftCount <= 0 ) {
4802
        if ( 0x403E < aExp ) {
4803
            float_raise( float_flag_invalid STATUS_VAR);
4804
            if (    ! aSign
4805
                 || (    ( aExp == 0x7FFF )
4806
                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4807
                    )
4808
               ) {
4809
                return LIT64( 0x7FFFFFFFFFFFFFFF );
4810
            }
4811
            return (int64_t) LIT64( 0x8000000000000000 );
4812
        }
4813
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4814
    }
4815
    else {
4816
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4817
    }
4818
    return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4819

    
4820
}
4821

    
4822
/*----------------------------------------------------------------------------
4823
| Returns the result of converting the quadruple-precision floating-point
4824
| value `a' to the 64-bit two's complement integer format.  The conversion
4825
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4826
| Arithmetic, except that the conversion is always rounded toward zero.
4827
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4828
| the conversion overflows, the largest integer with the same sign as `a' is
4829
| returned.
4830
*----------------------------------------------------------------------------*/
4831

    
4832
int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4833
{
4834
    flag aSign;
4835
    int32 aExp, shiftCount;
4836
    uint64_t aSig0, aSig1;
4837
    int64 z;
4838

    
4839
    aSig1 = extractFloat128Frac1( a );
4840
    aSig0 = extractFloat128Frac0( a );
4841
    aExp = extractFloat128Exp( a );
4842
    aSign = extractFloat128Sign( a );
4843
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4844
    shiftCount = aExp - 0x402F;
4845
    if ( 0 < shiftCount ) {
4846
        if ( 0x403E <= aExp ) {
4847
            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4848
            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4849
                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4850
                if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4851
            }
4852
            else {
4853
                float_raise( float_flag_invalid STATUS_VAR);
4854
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4855
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
4856
                }
4857
            }
4858
            return (int64_t) LIT64( 0x8000000000000000 );
4859
        }
4860
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4861
        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
4862
            STATUS(float_exception_flags) |= float_flag_inexact;
4863
        }
4864
    }
4865
    else {
4866
        if ( aExp < 0x3FFF ) {
4867
            if ( aExp | aSig0 | aSig1 ) {
4868
                STATUS(float_exception_flags) |= float_flag_inexact;
4869
            }
4870
            return 0;
4871
        }
4872
        z = aSig0>>( - shiftCount );
4873
        if (    aSig1
4874
             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4875
            STATUS(float_exception_flags) |= float_flag_inexact;
4876
        }
4877
    }
4878
    if ( aSign ) z = - z;
4879
    return z;
4880

    
4881
}
4882

    
4883
/*----------------------------------------------------------------------------
4884
| Returns the result of converting the quadruple-precision floating-point
4885
| value `a' to the single-precision floating-point format.  The conversion
4886
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4887
| Arithmetic.
4888
*----------------------------------------------------------------------------*/
4889

    
4890
float32 float128_to_float32( float128 a STATUS_PARAM )
4891
{
4892
    flag aSign;
4893
    int32 aExp;
4894
    uint64_t aSig0, aSig1;
4895
    uint32_t zSig;
4896

    
4897
    aSig1 = extractFloat128Frac1( a );
4898
    aSig0 = extractFloat128Frac0( a );
4899
    aExp = extractFloat128Exp( a );
4900
    aSign = extractFloat128Sign( a );
4901
    if ( aExp == 0x7FFF ) {
4902
        if ( aSig0 | aSig1 ) {
4903
            return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4904
        }
4905
        return packFloat32( aSign, 0xFF, 0 );
4906
    }
4907
    aSig0 |= ( aSig1 != 0 );
4908
    shift64RightJamming( aSig0, 18, &aSig0 );
4909
    zSig = aSig0;
4910
    if ( aExp || zSig ) {
4911
        zSig |= 0x40000000;
4912
        aExp -= 0x3F81;
4913
    }
4914
    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4915

    
4916
}
4917

    
4918
/*----------------------------------------------------------------------------
4919
| Returns the result of converting the quadruple-precision floating-point
4920
| value `a' to the double-precision floating-point format.  The conversion
4921
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4922
| Arithmetic.
4923
*----------------------------------------------------------------------------*/
4924

    
4925
float64 float128_to_float64( float128 a STATUS_PARAM )
4926
{
4927
    flag aSign;
4928
    int32 aExp;
4929
    uint64_t aSig0, aSig1;
4930

    
4931
    aSig1 = extractFloat128Frac1( a );
4932
    aSig0 = extractFloat128Frac0( a );
4933
    aExp = extractFloat128Exp( a );
4934
    aSign = extractFloat128Sign( a );
4935
    if ( aExp == 0x7FFF ) {
4936
        if ( aSig0 | aSig1 ) {
4937
            return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4938
        }
4939
        return packFloat64( aSign, 0x7FF, 0 );
4940
    }
4941
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4942
    aSig0 |= ( aSig1 != 0 );
4943
    if ( aExp || aSig0 ) {
4944
        aSig0 |= LIT64( 0x4000000000000000 );
4945
        aExp -= 0x3C01;
4946
    }
4947
    return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4948

    
4949
}
4950

    
4951
#ifdef FLOATX80
4952

    
4953
/*----------------------------------------------------------------------------
4954
| Returns the result of converting the quadruple-precision floating-point
4955
| value `a' to the extended double-precision floating-point format.  The
4956
| conversion is performed according to the IEC/IEEE Standard for Binary
4957
| Floating-Point Arithmetic.
4958
*----------------------------------------------------------------------------*/
4959

    
4960
floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4961
{
4962
    flag aSign;
4963
    int32 aExp;
4964
    uint64_t aSig0, aSig1;
4965

    
4966
    aSig1 = extractFloat128Frac1( a );
4967
    aSig0 = extractFloat128Frac0( a );
4968
    aExp = extractFloat128Exp( a );
4969
    aSign = extractFloat128Sign( a );
4970
    if ( aExp == 0x7FFF ) {
4971
        if ( aSig0 | aSig1 ) {
4972
            return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4973
        }
4974
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4975
    }
4976
    if ( aExp == 0 ) {
4977
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4978
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4979
    }
4980
    else {
4981
        aSig0 |= LIT64( 0x0001000000000000 );
4982
    }
4983
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4984
    return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4985

    
4986
}
4987

    
4988
#endif
4989

    
4990
/*----------------------------------------------------------------------------
4991
| Rounds the quadruple-precision floating-point value `a' to an integer, and
4992
| returns the result as a quadruple-precision floating-point value.  The
4993
| operation is performed according to the IEC/IEEE Standard for Binary
4994
| Floating-Point Arithmetic.
4995
*----------------------------------------------------------------------------*/
4996

    
4997
float128 float128_round_to_int( float128 a STATUS_PARAM )
4998
{
4999
    flag aSign;
5000
    int32 aExp;
5001
    uint64_t lastBitMask, roundBitsMask;
5002
    int8 roundingMode;
5003
    float128 z;
5004

    
5005
    aExp = extractFloat128Exp( a );
5006
    if ( 0x402F <= aExp ) {
5007
        if ( 0x406F <= aExp ) {
5008
            if (    ( aExp == 0x7FFF )
5009
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5010
               ) {
5011
                return propagateFloat128NaN( a, a STATUS_VAR );
5012
            }
5013
            return a;
5014
        }
5015
        lastBitMask = 1;
5016
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5017
        roundBitsMask = lastBitMask - 1;
5018
        z = a;
5019
        roundingMode = STATUS(float_rounding_mode);
5020
        if ( roundingMode == float_round_nearest_even ) {
5021
            if ( lastBitMask ) {
5022
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5023
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5024
            }
5025
            else {
5026
                if ( (int64_t) z.low < 0 ) {
5027
                    ++z.high;
5028
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5029
                }
5030
            }
5031
        }
5032
        else if ( roundingMode != float_round_to_zero ) {
5033
            if (   extractFloat128Sign( z )
5034
                 ^ ( roundingMode == float_round_up ) ) {
5035
                add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5036
            }
5037
        }
5038
        z.low &= ~ roundBitsMask;
5039
    }
5040
    else {
5041
        if ( aExp < 0x3FFF ) {
5042
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5043
            STATUS(float_exception_flags) |= float_flag_inexact;
5044
            aSign = extractFloat128Sign( a );
5045
            switch ( STATUS(float_rounding_mode) ) {
5046
             case float_round_nearest_even:
5047
                if (    ( aExp == 0x3FFE )
5048
                     && (   extractFloat128Frac0( a )
5049
                          | extractFloat128Frac1( a ) )
5050
                   ) {
5051
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
5052
                }
5053
                break;
5054
             case float_round_down:
5055
                return
5056
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5057
                    : packFloat128( 0, 0, 0, 0 );
5058
             case float_round_up:
5059
                return
5060
                      aSign ? packFloat128( 1, 0, 0, 0 )
5061
                    : packFloat128( 0, 0x3FFF, 0, 0 );
5062
            }
5063
            return packFloat128( aSign, 0, 0, 0 );
5064
        }
5065
        lastBitMask = 1;
5066
        lastBitMask <<= 0x402F - aExp;
5067
        roundBitsMask = lastBitMask - 1;
5068
        z.low = 0;
5069
        z.high = a.high;
5070
        roundingMode = STATUS(float_rounding_mode);
5071
        if ( roundingMode == float_round_nearest_even ) {
5072
            z.high += lastBitMask>>1;
5073
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5074
                z.high &= ~ lastBitMask;
5075
            }
5076
        }
5077
        else if ( roundingMode != float_round_to_zero ) {
5078
            if (   extractFloat128Sign( z )
5079
                 ^ ( roundingMode == float_round_up ) ) {
5080
                z.high |= ( a.low != 0 );
5081
                z.high += roundBitsMask;
5082
            }
5083
        }
5084
        z.high &= ~ roundBitsMask;
5085
    }
5086
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5087
        STATUS(float_exception_flags) |= float_flag_inexact;
5088
    }
5089
    return z;
5090

    
5091
}
5092

    
5093
/*----------------------------------------------------------------------------
5094
| Returns the result of adding the absolute values of the quadruple-precision
5095
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
5096
| before being returned.  `zSign' is ignored if the result is a NaN.
5097
| The addition is performed according to the IEC/IEEE Standard for Binary
5098
| Floating-Point Arithmetic.
5099
*----------------------------------------------------------------------------*/
5100

    
5101
static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5102
{
5103
    int32 aExp, bExp, zExp;
5104
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5105
    int32 expDiff;
5106

    
5107
    aSig1 = extractFloat128Frac1( a );
5108
    aSig0 = extractFloat128Frac0( a );
5109
    aExp = extractFloat128Exp( a );
5110
    bSig1 = extractFloat128Frac1( b );
5111
    bSig0 = extractFloat128Frac0( b );
5112
    bExp = extractFloat128Exp( b );
5113
    expDiff = aExp - bExp;
5114
    if ( 0 < expDiff ) {
5115
        if ( aExp == 0x7FFF ) {
5116
            if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5117
            return a;
5118
        }
5119
        if ( bExp == 0 ) {
5120
            --expDiff;
5121
        }
5122
        else {
5123
            bSig0 |= LIT64( 0x0001000000000000 );
5124
        }
5125
        shift128ExtraRightJamming(
5126
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5127
        zExp = aExp;
5128
    }
5129
    else if ( expDiff < 0 ) {
5130
        if ( bExp == 0x7FFF ) {
5131
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5132
            return packFloat128( zSign, 0x7FFF, 0, 0 );
5133
        }
5134
        if ( aExp == 0 ) {
5135
            ++expDiff;
5136
        }
5137
        else {
5138
            aSig0 |= LIT64( 0x0001000000000000 );
5139
        }
5140
        shift128ExtraRightJamming(
5141
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5142
        zExp = bExp;
5143
    }
5144
    else {
5145
        if ( aExp == 0x7FFF ) {
5146
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5147
                return propagateFloat128NaN( a, b STATUS_VAR );
5148
            }
5149
            return a;
5150
        }
5151
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5152
        if ( aExp == 0 ) {
5153
            if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
5154
            return packFloat128( zSign, 0, zSig0, zSig1 );
5155
        }
5156
        zSig2 = 0;
5157
        zSig0 |= LIT64( 0x0002000000000000 );
5158
        zExp = aExp;
5159
        goto shiftRight1;
5160
    }
5161
    aSig0 |= LIT64( 0x0001000000000000 );
5162
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5163
    --zExp;
5164
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5165
    ++zExp;
5166
 shiftRight1:
5167
    shift128ExtraRightJamming(
5168
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5169
 roundAndPack:
5170
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5171

    
5172
}
5173

    
5174
/*----------------------------------------------------------------------------
5175
| Returns the result of subtracting the absolute values of the quadruple-
5176
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
5177
| difference is negated before being returned.  `zSign' is ignored if the
5178
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
5179
| Standard for Binary Floating-Point Arithmetic.
5180
*----------------------------------------------------------------------------*/
5181

    
5182
static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5183
{
5184
    int32 aExp, bExp, zExp;
5185
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5186
    int32 expDiff;
5187
    float128 z;
5188

    
5189
    aSig1 = extractFloat128Frac1( a );
5190
    aSig0 = extractFloat128Frac0( a );
5191
    aExp = extractFloat128Exp( a );
5192
    bSig1 = extractFloat128Frac1( b );
5193
    bSig0 = extractFloat128Frac0( b );
5194
    bExp = extractFloat128Exp( b );
5195
    expDiff = aExp - bExp;
5196
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5197
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5198
    if ( 0 < expDiff ) goto aExpBigger;
5199
    if ( expDiff < 0 ) goto bExpBigger;
5200
    if ( aExp == 0x7FFF ) {
5201
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5202
            return propagateFloat128NaN( a, b STATUS_VAR );
5203
        }
5204
        float_raise( float_flag_invalid STATUS_VAR);
5205
        z.low = float128_default_nan_low;
5206
        z.high = float128_default_nan_high;
5207
        return z;
5208
    }
5209
    if ( aExp == 0 ) {
5210
        aExp = 1;
5211
        bExp = 1;
5212
    }
5213
    if ( bSig0 < aSig0 ) goto aBigger;
5214
    if ( aSig0 < bSig0 ) goto bBigger;
5215
    if ( bSig1 < aSig1 ) goto aBigger;
5216
    if ( aSig1 < bSig1 ) goto bBigger;
5217
    return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5218
 bExpBigger:
5219
    if ( bExp == 0x7FFF ) {
5220
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5221
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5222
    }
5223
    if ( aExp == 0 ) {
5224
        ++expDiff;
5225
    }
5226
    else {
5227
        aSig0 |= LIT64( 0x4000000000000000 );
5228
    }
5229
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5230
    bSig0 |= LIT64( 0x4000000000000000 );
5231
 bBigger:
5232
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5233
    zExp = bExp;
5234
    zSign ^= 1;
5235
    goto normalizeRoundAndPack;
5236
 aExpBigger:
5237
    if ( aExp == 0x7FFF ) {
5238
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5239
        return a;
5240
    }
5241
    if ( bExp == 0 ) {
5242
        --expDiff;
5243
    }
5244
    else {
5245
        bSig0 |= LIT64( 0x4000000000000000 );
5246
    }
5247
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5248
    aSig0 |= LIT64( 0x4000000000000000 );
5249
 aBigger:
5250
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5251
    zExp = aExp;
5252
 normalizeRoundAndPack:
5253
    --zExp;
5254
    return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5255

    
5256
}
5257

    
5258
/*----------------------------------------------------------------------------
5259
| Returns the result of adding the quadruple-precision floating-point values
5260
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
5261
| for Binary Floating-Point Arithmetic.
5262
*----------------------------------------------------------------------------*/
5263

    
5264
float128 float128_add( float128 a, float128 b STATUS_PARAM )
5265
{
5266
    flag aSign, bSign;
5267

    
5268
    aSign = extractFloat128Sign( a );
5269
    bSign = extractFloat128Sign( b );
5270
    if ( aSign == bSign ) {
5271
        return addFloat128Sigs( a, b, aSign STATUS_VAR );
5272
    }
5273
    else {
5274
        return subFloat128Sigs( a, b, aSign STATUS_VAR );
5275
    }
5276

    
5277
}
5278

    
5279
/*----------------------------------------------------------------------------
5280
| Returns the result of subtracting the quadruple-precision floating-point
5281
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
5282
| Standard for Binary Floating-Point Arithmetic.
5283
*----------------------------------------------------------------------------*/
5284

    
5285
float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5286
{
5287
    flag aSign, bSign;
5288

    
5289
    aSign = extractFloat128Sign( a );
5290
    bSign = extractFloat128Sign( b );
5291
    if ( aSign == bSign ) {
5292
        return subFloat128Sigs( a, b, aSign STATUS_VAR );
5293
    }
5294
    else {
5295
        return addFloat128Sigs( a, b, aSign STATUS_VAR );
5296
    }
5297

    
5298
}
5299

    
5300
/*----------------------------------------------------------------------------
5301
| Returns the result of multiplying the quadruple-precision floating-point
5302
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
5303
| Standard for Binary Floating-Point Arithmetic.
5304
*----------------------------------------------------------------------------*/
5305

    
5306
float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5307
{
5308
    flag aSign, bSign, zSign;
5309
    int32 aExp, bExp, zExp;
5310
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5311
    float128 z;
5312

    
5313
    aSig1 = extractFloat128Frac1( a );
5314
    aSig0 = extractFloat128Frac0( a );
5315
    aExp = extractFloat128Exp( a );
5316
    aSign = extractFloat128Sign( a );
5317
    bSig1 = extractFloat128Frac1( b );
5318
    bSig0 = extractFloat128Frac0( b );
5319
    bExp = extractFloat128Exp( b );
5320
    bSign = extractFloat128Sign( b );
5321
    zSign = aSign ^ bSign;
5322
    if ( aExp == 0x7FFF ) {
5323
        if (    ( aSig0 | aSig1 )
5324
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5325
            return propagateFloat128NaN( a, b STATUS_VAR );
5326
        }
5327
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5328
        return packFloat128( zSign, 0x7FFF, 0, 0 );
5329
    }
5330
    if ( bExp == 0x7FFF ) {
5331
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5332
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5333
 invalid:
5334
            float_raise( float_flag_invalid STATUS_VAR);
5335
            z.low = float128_default_nan_low;
5336
            z.high = float128_default_nan_high;
5337
            return z;
5338
        }
5339
        return packFloat128( zSign, 0x7FFF, 0, 0 );
5340
    }
5341
    if ( aExp == 0 ) {
5342
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5343
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5344
    }
5345
    if ( bExp == 0 ) {
5346
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5347
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5348
    }
5349
    zExp = aExp + bExp - 0x4000;
5350
    aSig0 |= LIT64( 0x0001000000000000 );
5351
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5352
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5353
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5354
    zSig2 |= ( zSig3 != 0 );
5355
    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5356
        shift128ExtraRightJamming(
5357
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5358
        ++zExp;
5359
    }
5360
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5361

    
5362
}
5363

    
5364
/*----------------------------------------------------------------------------
5365
| Returns the result of dividing the quadruple-precision floating-point value
5366
| `a' by the corresponding value `b'.  The operation is performed according to
5367
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5368
*----------------------------------------------------------------------------*/
5369

    
5370
float128 float128_div( float128 a, float128 b STATUS_PARAM )
5371
{
5372
    flag aSign, bSign, zSign;
5373
    int32 aExp, bExp, zExp;
5374
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5375
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5376
    float128 z;
5377

    
5378
    aSig1 = extractFloat128Frac1( a );
5379
    aSig0 = extractFloat128Frac0( a );
5380
    aExp = extractFloat128Exp( a );
5381
    aSign = extractFloat128Sign( a );
5382
    bSig1 = extractFloat128Frac1( b );
5383
    bSig0 = extractFloat128Frac0( b );
5384
    bExp = extractFloat128Exp( b );
5385
    bSign = extractFloat128Sign( b );
5386
    zSign = aSign ^ bSign;
5387
    if ( aExp == 0x7FFF ) {
5388
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5389
        if ( bExp == 0x7FFF ) {
5390
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5391
            goto invalid;
5392
        }
5393
        return packFloat128( zSign, 0x7FFF, 0, 0 );
5394
    }
5395
    if ( bExp == 0x7FFF ) {
5396
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5397
        return packFloat128( zSign, 0, 0, 0 );
5398
    }
5399
    if ( bExp == 0 ) {
5400
        if ( ( bSig0 | bSig1 ) == 0 ) {
5401
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5402
 invalid:
5403
                float_raise( float_flag_invalid STATUS_VAR);
5404
                z.low = float128_default_nan_low;
5405
                z.high = float128_default_nan_high;
5406
                return z;
5407
            }
5408
            float_raise( float_flag_divbyzero STATUS_VAR);
5409
            return packFloat128( zSign, 0x7FFF, 0, 0 );
5410
        }
5411
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5412
    }
5413
    if ( aExp == 0 ) {
5414
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5415
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5416
    }
5417
    zExp = aExp - bExp + 0x3FFD;
5418
    shortShift128Left(
5419
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5420
    shortShift128Left(
5421
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5422
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5423
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5424
        ++zExp;
5425
    }
5426
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5427
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5428
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5429
    while ( (int64_t) rem0 < 0 ) {
5430
        --zSig0;
5431
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5432
    }
5433
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5434
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5435
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5436
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5437
        while ( (int64_t) rem1 < 0 ) {
5438
            --zSig1;
5439
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5440
        }
5441
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5442
    }
5443
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5444
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5445

    
5446
}
5447

    
5448
/*----------------------------------------------------------------------------
5449
| Returns the remainder of the quadruple-precision floating-point value `a'
5450
| with respect to the corresponding value `b'.  The operation is performed
5451
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5452
*----------------------------------------------------------------------------*/
5453

    
5454
float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5455
{
5456
    flag aSign, zSign;
5457
    int32 aExp, bExp, expDiff;
5458
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5459
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
5460
    int64_t sigMean0;
5461
    float128 z;
5462

    
5463
    aSig1 = extractFloat128Frac1( a );
5464
    aSig0 = extractFloat128Frac0( a );
5465
    aExp = extractFloat128Exp( a );
5466
    aSign = extractFloat128Sign( a );
5467
    bSig1 = extractFloat128Frac1( b );
5468
    bSig0 = extractFloat128Frac0( b );
5469
    bExp = extractFloat128Exp( b );
5470
    if ( aExp == 0x7FFF ) {
5471
        if (    ( aSig0 | aSig1 )
5472
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5473
            return propagateFloat128NaN( a, b STATUS_VAR );
5474
        }
5475
        goto invalid;
5476
    }
5477
    if ( bExp == 0x7FFF ) {
5478
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5479
        return a;
5480
    }
5481
    if ( bExp == 0 ) {
5482
        if ( ( bSig0 | bSig1 ) == 0 ) {
5483
 invalid:
5484
            float_raise( float_flag_invalid STATUS_VAR);
5485
            z.low = float128_default_nan_low;
5486
            z.high = float128_default_nan_high;
5487
            return z;
5488
        }
5489
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5490
    }
5491
    if ( aExp == 0 ) {
5492
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
5493
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5494
    }
5495
    expDiff = aExp - bExp;
5496
    if ( expDiff < -1 ) return a;
5497
    shortShift128Left(
5498
        aSig0 | LIT64( 0x0001000000000000 ),
5499
        aSig1,
5500
        15 - ( expDiff < 0 ),
5501
        &aSig0,
5502
        &aSig1
5503
    );
5504
    shortShift128Left(
5505
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5506
    q = le128( bSig0, bSig1, aSig0, aSig1 );
5507
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5508
    expDiff -= 64;
5509
    while ( 0 < expDiff ) {
5510
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5511
        q = ( 4 < q ) ? q - 4 : 0;
5512
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5513
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5514
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5515
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5516
        expDiff -= 61;
5517
    }
5518
    if ( -64 < expDiff ) {
5519
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5520
        q = ( 4 < q ) ? q - 4 : 0;
5521
        q >>= - expDiff;
5522
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5523
        expDiff += 52;
5524
        if ( expDiff < 0 ) {
5525
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5526
        }
5527
        else {
5528
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5529
        }
5530
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5531
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5532
    }
5533
    else {
5534
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5535
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5536
    }
5537
    do {
5538
        alternateASig0 = aSig0;
5539
        alternateASig1 = aSig1;
5540
        ++q;
5541
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5542
    } while ( 0 <= (int64_t) aSig0 );
5543
    add128(
5544
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
5545
    if (    ( sigMean0 < 0 )
5546
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5547
        aSig0 = alternateASig0;
5548
        aSig1 = alternateASig1;
5549
    }
5550
    zSign = ( (int64_t) aSig0 < 0 );
5551
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5552
    return
5553
        normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5554

    
5555
}
5556

    
5557
/*----------------------------------------------------------------------------
5558
| Returns the square root of the quadruple-precision floating-point value `a'.
5559
| The operation is performed according to the IEC/IEEE Standard for Binary
5560
| Floating-Point Arithmetic.
5561
*----------------------------------------------------------------------------*/
5562

    
5563
float128 float128_sqrt( float128 a STATUS_PARAM )
5564
{
5565
    flag aSign;
5566
    int32 aExp, zExp;
5567
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5568
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5569
    float128 z;
5570

    
5571
    aSig1 = extractFloat128Frac1( a );
5572
    aSig0 = extractFloat128Frac0( a );
5573
    aExp = extractFloat128Exp( a );
5574
    aSign = extractFloat128Sign( a );
5575
    if ( aExp == 0x7FFF ) {
5576
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5577
        if ( ! aSign ) return a;
5578
        goto invalid;
5579
    }
5580
    if ( aSign ) {
5581
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5582
 invalid:
5583
        float_raise( float_flag_invalid STATUS_VAR);
5584
        z.low = float128_default_nan_low;
5585
        z.high = float128_default_nan_high;
5586
        return z;
5587
    }
5588
    if ( aExp == 0 ) {
5589
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5590
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5591
    }
5592
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5593
    aSig0 |= LIT64( 0x0001000000000000 );
5594
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5595
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5596
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5597
    doubleZSig0 = zSig0<<1;
5598
    mul64To128( zSig0, zSig0, &term0, &term1 );
5599
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5600
    while ( (int64_t) rem0 < 0 ) {
5601
        --zSig0;
5602
        doubleZSig0 -= 2;
5603
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5604
    }
5605
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5606
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5607
        if ( zSig1 == 0 ) zSig1 = 1;
5608
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5609
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5610
        mul64To128( zSig1, zSig1, &term2, &term3 );
5611
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5612
        while ( (int64_t) rem1 < 0 ) {
5613
            --zSig1;
5614
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5615
            term3 |= 1;
5616
            term2 |= doubleZSig0;
5617
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5618
        }
5619
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5620
    }
5621
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5622
    return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5623

    
5624
}
5625

    
5626
/*----------------------------------------------------------------------------
5627
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5628
| the corresponding value `b', and 0 otherwise.  The comparison is performed
5629
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5630
*----------------------------------------------------------------------------*/
5631

    
5632
int float128_eq( float128 a, float128 b STATUS_PARAM )
5633
{
5634

    
5635
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5636
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5637
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5638
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5639
       ) {
5640
        if (    float128_is_signaling_nan( a )
5641
             || float128_is_signaling_nan( b ) ) {
5642
            float_raise( float_flag_invalid STATUS_VAR);
5643
        }
5644
        return 0;
5645
    }
5646
    return
5647
           ( a.low == b.low )
5648
        && (    ( a.high == b.high )
5649
             || (    ( a.low == 0 )
5650
                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5651
           );
5652

    
5653
}
5654

    
5655
/*----------------------------------------------------------------------------
5656
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5657
| or equal to the corresponding value `b', and 0 otherwise.  The comparison
5658
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5659
| Arithmetic.
5660
*----------------------------------------------------------------------------*/
5661

    
5662
int float128_le( float128 a, float128 b STATUS_PARAM )
5663
{
5664
    flag aSign, bSign;
5665

    
5666
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5667
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5668
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5669
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5670
       ) {
5671
        float_raise( float_flag_invalid STATUS_VAR);
5672
        return 0;
5673
    }
5674
    aSign = extractFloat128Sign( a );
5675
    bSign = extractFloat128Sign( b );
5676
    if ( aSign != bSign ) {
5677
        return
5678
               aSign
5679
            || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5680
                 == 0 );
5681
    }
5682
    return
5683
          aSign ? le128( b.high, b.low, a.high, a.low )
5684
        : le128( a.high, a.low, b.high, b.low );
5685

    
5686
}
5687

    
5688
/*----------------------------------------------------------------------------
5689
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5690
| the corresponding value `b', and 0 otherwise.  The comparison is performed
5691
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5692
*----------------------------------------------------------------------------*/
5693

    
5694
int float128_lt( float128 a, float128 b STATUS_PARAM )
5695
{
5696
    flag aSign, bSign;
5697

    
5698
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5699
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5700
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5701
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5702
       ) {
5703
        float_raise( float_flag_invalid STATUS_VAR);
5704
        return 0;
5705
    }
5706
    aSign = extractFloat128Sign( a );
5707
    bSign = extractFloat128Sign( b );
5708
    if ( aSign != bSign ) {
5709
        return
5710
               aSign
5711
            && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5712
                 != 0 );
5713
    }
5714
    return
5715
          aSign ? lt128( b.high, b.low, a.high, a.low )
5716
        : lt128( a.high, a.low, b.high, b.low );
5717

    
5718
}
5719

    
5720
/*----------------------------------------------------------------------------
5721
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5722
| the corresponding value `b', and 0 otherwise.  The invalid exception is
5723
| raised if either operand is a NaN.  Otherwise, the comparison is performed
5724
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5725
*----------------------------------------------------------------------------*/
5726

    
5727
int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5728
{
5729

    
5730
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5731
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5732
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5733
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5734
       ) {
5735
        float_raise( float_flag_invalid STATUS_VAR);
5736
        return 0;
5737
    }
5738
    return
5739
           ( a.low == b.low )
5740
        && (    ( a.high == b.high )
5741
             || (    ( a.low == 0 )
5742
                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5743
           );
5744

    
5745
}
5746

    
5747
/*----------------------------------------------------------------------------
5748
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5749
| or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5750
| cause an exception.  Otherwise, the comparison is performed according to the
5751
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5752
*----------------------------------------------------------------------------*/
5753

    
5754
int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5755
{
5756
    flag aSign, bSign;
5757

    
5758
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5759
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5760
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5761
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5762
       ) {
5763
        if (    float128_is_signaling_nan( a )
5764
             || float128_is_signaling_nan( b ) ) {
5765
            float_raise( float_flag_invalid STATUS_VAR);
5766
        }
5767
        return 0;
5768
    }
5769
    aSign = extractFloat128Sign( a );
5770
    bSign = extractFloat128Sign( b );
5771
    if ( aSign != bSign ) {
5772
        return
5773
               aSign
5774
            || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5775
                 == 0 );
5776
    }
5777
    return
5778
          aSign ? le128( b.high, b.low, a.high, a.low )
5779
        : le128( a.high, a.low, b.high, b.low );
5780

    
5781
}
5782

    
5783
/*----------------------------------------------------------------------------
5784
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5785
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5786
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5787
| Standard for Binary Floating-Point Arithmetic.
5788
*----------------------------------------------------------------------------*/
5789

    
5790
int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5791
{
5792
    flag aSign, bSign;
5793

    
5794
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5795
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5796
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5797
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5798
       ) {
5799
        if (    float128_is_signaling_nan( a )
5800
             || float128_is_signaling_nan( b ) ) {
5801
            float_raise( float_flag_invalid STATUS_VAR);
5802
        }
5803
        return 0;
5804
    }
5805
    aSign = extractFloat128Sign( a );
5806
    bSign = extractFloat128Sign( b );
5807
    if ( aSign != bSign ) {
5808
        return
5809
               aSign
5810
            && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5811
                 != 0 );
5812
    }
5813
    return
5814
          aSign ? lt128( b.high, b.low, a.high, a.low )
5815
        : lt128( a.high, a.low, b.high, b.low );
5816

    
5817
}
5818

    
5819
#endif
5820

    
5821
/* misc functions */
5822
float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5823
{
5824
    return int64_to_float32(a STATUS_VAR);
5825
}
5826

    
5827
float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5828
{
5829
    return int64_to_float64(a STATUS_VAR);
5830
}
5831

    
5832
unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5833
{
5834
    int64_t v;
5835
    unsigned int res;
5836

    
5837
    v = float32_to_int64(a STATUS_VAR);
5838
    if (v < 0) {
5839
        res = 0;
5840
        float_raise( float_flag_invalid STATUS_VAR);
5841
    } else if (v > 0xffffffff) {
5842
        res = 0xffffffff;
5843
        float_raise( float_flag_invalid STATUS_VAR);
5844
    } else {
5845
        res = v;
5846
    }
5847
    return res;
5848
}
5849

    
5850
unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5851
{
5852
    int64_t v;
5853
    unsigned int res;
5854

    
5855
    v = float32_to_int64_round_to_zero(a STATUS_VAR);
5856
    if (v < 0) {
5857
        res = 0;
5858
        float_raise( float_flag_invalid STATUS_VAR);
5859
    } else if (v > 0xffffffff) {
5860
        res = 0xffffffff;
5861
        float_raise( float_flag_invalid STATUS_VAR);
5862
    } else {
5863
        res = v;
5864
    }
5865
    return res;
5866
}
5867

    
5868
unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
5869
{
5870
    int64_t v;
5871
    unsigned int res;
5872

    
5873
    v = float32_to_int64_round_to_zero(a STATUS_VAR);
5874
    if (v < 0) {
5875
        res = 0;
5876
        float_raise( float_flag_invalid STATUS_VAR);
5877
    } else if (v > 0xffff) {
5878
        res = 0xffff;
5879
        float_raise( float_flag_invalid STATUS_VAR);
5880
    } else {
5881
        res = v;
5882
    }
5883
    return res;
5884
}
5885

    
5886
unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5887
{
5888
    int64_t v;
5889
    unsigned int res;
5890

    
5891
    v = float64_to_int64(a STATUS_VAR);
5892
    if (v < 0) {
5893
        res = 0;
5894
        float_raise( float_flag_invalid STATUS_VAR);
5895
    } else if (v > 0xffffffff) {
5896
        res = 0xffffffff;
5897
        float_raise( float_flag_invalid STATUS_VAR);
5898
    } else {
5899
        res = v;
5900
    }
5901
    return res;
5902
}
5903

    
5904
unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5905
{
5906
    int64_t v;
5907
    unsigned int res;
5908

    
5909
    v = float64_to_int64_round_to_zero(a STATUS_VAR);
5910
    if (v < 0) {
5911
        res = 0;
5912
        float_raise( float_flag_invalid STATUS_VAR);
5913
    } else if (v > 0xffffffff) {
5914
        res = 0xffffffff;
5915
        float_raise( float_flag_invalid STATUS_VAR);
5916
    } else {
5917
        res = v;
5918
    }
5919
    return res;
5920
}
5921

    
5922
unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
5923
{
5924
    int64_t v;
5925
    unsigned int res;
5926

    
5927
    v = float64_to_int64_round_to_zero(a STATUS_VAR);
5928
    if (v < 0) {
5929
        res = 0;
5930
        float_raise( float_flag_invalid STATUS_VAR);
5931
    } else if (v > 0xffff) {
5932
        res = 0xffff;
5933
        float_raise( float_flag_invalid STATUS_VAR);
5934
    } else {
5935
        res = v;
5936
    }
5937
    return res;
5938
}
5939

    
5940
/* FIXME: This looks broken.  */
5941
uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5942
{
5943
    int64_t v;
5944

    
5945
    v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5946
    v += float64_val(a);
5947
    v = float64_to_int64(make_float64(v) STATUS_VAR);
5948

    
5949
    return v - INT64_MIN;
5950
}
5951

    
5952
uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5953
{
5954
    int64_t v;
5955

    
5956
    v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5957
    v += float64_val(a);
5958
    v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5959

    
5960
    return v - INT64_MIN;
5961
}
5962

    
5963
#define COMPARE(s, nan_exp)                                                  \
5964
INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
5965
                                      int is_quiet STATUS_PARAM )            \
5966
{                                                                            \
5967
    flag aSign, bSign;                                                       \
5968
    uint ## s ## _t av, bv;                                                  \
5969
    a = float ## s ## _squash_input_denormal(a STATUS_VAR);                  \
5970
    b = float ## s ## _squash_input_denormal(b STATUS_VAR);                  \
5971
                                                                             \
5972
    if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
5973
         extractFloat ## s ## Frac( a ) ) ||                                 \
5974
        ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
5975
          extractFloat ## s ## Frac( b ) )) {                                \
5976
        if (!is_quiet ||                                                     \
5977
            float ## s ## _is_signaling_nan( a ) ||                          \
5978
            float ## s ## _is_signaling_nan( b ) ) {                         \
5979
            float_raise( float_flag_invalid STATUS_VAR);                     \
5980
        }                                                                    \
5981
        return float_relation_unordered;                                     \
5982
    }                                                                        \
5983
    aSign = extractFloat ## s ## Sign( a );                                  \
5984
    bSign = extractFloat ## s ## Sign( b );                                  \
5985
    av = float ## s ## _val(a);                                              \
5986
    bv = float ## s ## _val(b);                                              \
5987
    if ( aSign != bSign ) {                                                  \
5988
        if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
5989
            /* zero case */                                                  \
5990
            return float_relation_equal;                                     \
5991
        } else {                                                             \
5992
            return 1 - (2 * aSign);                                          \
5993
        }                                                                    \
5994
    } else {                                                                 \
5995
        if (av == bv) {                                                      \
5996
            return float_relation_equal;                                     \
5997
        } else {                                                             \
5998
            return 1 - 2 * (aSign ^ ( av < bv ));                            \
5999
        }                                                                    \
6000
    }                                                                        \
6001
}                                                                            \
6002
                                                                             \
6003
int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
6004
{                                                                            \
6005
    return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
6006
}                                                                            \
6007
                                                                             \
6008
int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
6009
{                                                                            \
6010
    return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
6011
}
6012

    
6013
COMPARE(32, 0xff)
6014
COMPARE(64, 0x7ff)
6015

    
6016
INLINE int float128_compare_internal( float128 a, float128 b,
6017
                                      int is_quiet STATUS_PARAM )
6018
{
6019
    flag aSign, bSign;
6020

    
6021
    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6022
          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6023
        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6024
          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6025
        if (!is_quiet ||
6026
            float128_is_signaling_nan( a ) ||
6027
            float128_is_signaling_nan( b ) ) {
6028
            float_raise( float_flag_invalid STATUS_VAR);
6029
        }
6030
        return float_relation_unordered;
6031
    }
6032
    aSign = extractFloat128Sign( a );
6033
    bSign = extractFloat128Sign( b );
6034
    if ( aSign != bSign ) {
6035
        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6036
            /* zero case */
6037
            return float_relation_equal;
6038
        } else {
6039
            return 1 - (2 * aSign);
6040
        }
6041
    } else {
6042
        if (a.low == b.low && a.high == b.high) {
6043
            return float_relation_equal;
6044
        } else {
6045
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6046
        }
6047
    }
6048
}
6049

    
6050
int float128_compare( float128 a, float128 b STATUS_PARAM )
6051
{
6052
    return float128_compare_internal(a, b, 0 STATUS_VAR);
6053
}
6054

    
6055
int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6056
{
6057
    return float128_compare_internal(a, b, 1 STATUS_VAR);
6058
}
6059

    
6060
/* Multiply A by 2 raised to the power N.  */
6061
float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6062
{
6063
    flag aSign;
6064
    int16 aExp;
6065
    uint32_t aSig;
6066

    
6067
    a = float32_squash_input_denormal(a STATUS_VAR);
6068
    aSig = extractFloat32Frac( a );
6069
    aExp = extractFloat32Exp( a );
6070
    aSign = extractFloat32Sign( a );
6071

    
6072
    if ( aExp == 0xFF ) {
6073
        return a;
6074
    }
6075
    if ( aExp != 0 )
6076
        aSig |= 0x00800000;
6077
    else if ( aSig == 0 )
6078
        return a;
6079

    
6080
    aExp += n - 1;
6081
    aSig <<= 7;
6082
    return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6083
}
6084

    
6085
float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6086
{
6087
    flag aSign;
6088
    int16 aExp;
6089
    uint64_t aSig;
6090

    
6091
    a = float64_squash_input_denormal(a STATUS_VAR);
6092
    aSig = extractFloat64Frac( a );
6093
    aExp = extractFloat64Exp( a );
6094
    aSign = extractFloat64Sign( a );
6095

    
6096
    if ( aExp == 0x7FF ) {
6097
        return a;
6098
    }
6099
    if ( aExp != 0 )
6100
        aSig |= LIT64( 0x0010000000000000 );
6101
    else if ( aSig == 0 )
6102
        return a;
6103

    
6104
    aExp += n - 1;
6105
    aSig <<= 10;
6106
    return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6107
}
6108

    
6109
#ifdef FLOATX80
6110
floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6111
{
6112
    flag aSign;
6113
    int16 aExp;
6114
    uint64_t aSig;
6115

    
6116
    aSig = extractFloatx80Frac( a );
6117
    aExp = extractFloatx80Exp( a );
6118
    aSign = extractFloatx80Sign( a );
6119

    
6120
    if ( aExp == 0x7FF ) {
6121
        return a;
6122
    }
6123
    if (aExp == 0 && aSig == 0)
6124
        return a;
6125

    
6126
    aExp += n;
6127
    return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6128
                                          aSign, aExp, aSig, 0 STATUS_VAR );
6129
}
6130
#endif
6131

    
6132
#ifdef FLOAT128
6133
float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6134
{
6135
    flag aSign;
6136
    int32 aExp;
6137
    uint64_t aSig0, aSig1;
6138

    
6139
    aSig1 = extractFloat128Frac1( a );
6140
    aSig0 = extractFloat128Frac0( a );
6141
    aExp = extractFloat128Exp( a );
6142
    aSign = extractFloat128Sign( a );
6143
    if ( aExp == 0x7FFF ) {
6144
        return a;
6145
    }
6146
    if ( aExp != 0 )
6147
        aSig0 |= LIT64( 0x0001000000000000 );
6148
    else if ( aSig0 == 0 && aSig1 == 0 )
6149
        return a;
6150

    
6151
    aExp += n - 1;
6152
    return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6153
                                          STATUS_VAR );
6154

    
6155
}
6156
#endif