Statistics
| Branch: | Revision:

root / fpu / softfloat.c @ 211315fb

History | View | Annotate | Download (225.1 kB)

1
/*
2
 * QEMU float support
3
 *
4
 * Derived from SoftFloat.
5
 */
6

    
7
/*============================================================================
8

9
This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
10
Package, Release 2b.
11

12
Written by John R. Hauser.  This work was made possible in part by the
13
International Computer Science Institute, located at Suite 600, 1947 Center
14
Street, Berkeley, California 94704.  Funding was partially provided by the
15
National Science Foundation under grant MIP-9311980.  The original version
16
of this code was written as part of a project to build a fixed-point vector
17
processor in collaboration with the University of California at Berkeley,
18
overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
19
is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20
arithmetic/SoftFloat.html'.
21

22
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
23
been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24
RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25
AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26
COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27
EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28
INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29
OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
30

31
Derivative works are acceptable, even for commercial purposes, so long as
32
(1) the source code for the derivative work includes prominent notice that
33
the work is derivative, and (2) the source code includes prominent notice with
34
these four paragraphs for those parts of this code that are retained.
35

36
=============================================================================*/
37

    
38
#include "softfloat.h"
39

    
40
/*----------------------------------------------------------------------------
41
| Primitive arithmetic functions, including multi-word arithmetic, and
42
| division and square root approximations.  (Can be specialized to target if
43
| desired.)
44
*----------------------------------------------------------------------------*/
45
#include "softfloat-macros.h"
46

    
47
/*----------------------------------------------------------------------------
48
| Functions and definitions to determine:  (1) whether tininess for underflow
49
| is detected before or after rounding by default, (2) what (if anything)
50
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
51
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
52
| are propagated from function inputs to output.  These details are target-
53
| specific.
54
*----------------------------------------------------------------------------*/
55
#include "softfloat-specialize.h"
56

    
57
void set_float_rounding_mode(int val STATUS_PARAM)
58
{
59
    STATUS(float_rounding_mode) = val;
60
}
61

    
62
void set_float_exception_flags(int val STATUS_PARAM)
63
{
64
    STATUS(float_exception_flags) = val;
65
}
66

    
67
#ifdef FLOATX80
68
void set_floatx80_rounding_precision(int val STATUS_PARAM)
69
{
70
    STATUS(floatx80_rounding_precision) = val;
71
}
72
#endif
73

    
74
/*----------------------------------------------------------------------------
75
| Returns the fraction bits of the half-precision floating-point value `a'.
76
*----------------------------------------------------------------------------*/
77

    
78
INLINE uint32_t extractFloat16Frac(float16 a)
79
{
80
    return float16_val(a) & 0x3ff;
81
}
82

    
83
/*----------------------------------------------------------------------------
84
| Returns the exponent bits of the half-precision floating-point value `a'.
85
*----------------------------------------------------------------------------*/
86

    
87
INLINE int16 extractFloat16Exp(float16 a)
88
{
89
    return (float16_val(a) >> 10) & 0x1f;
90
}
91

    
92
/*----------------------------------------------------------------------------
93
| Returns the sign bit of the single-precision floating-point value `a'.
94
*----------------------------------------------------------------------------*/
95

    
96
INLINE flag extractFloat16Sign(float16 a)
97
{
98
    return float16_val(a)>>15;
99
}
100

    
101
/*----------------------------------------------------------------------------
102
| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
103
| and 7, and returns the properly rounded 32-bit integer corresponding to the
104
| input.  If `zSign' is 1, the input is negated before being converted to an
105
| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
106
| is simply rounded to an integer, with the inexact exception raised if the
107
| input cannot be represented exactly as an integer.  However, if the fixed-
108
| point input is too large, the invalid exception is raised and the largest
109
| positive or negative integer is returned.
110
*----------------------------------------------------------------------------*/
111

    
112
static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
113
{
114
    int8 roundingMode;
115
    flag roundNearestEven;
116
    int8 roundIncrement, roundBits;
117
    int32 z;
118

    
119
    roundingMode = STATUS(float_rounding_mode);
120
    roundNearestEven = ( roundingMode == float_round_nearest_even );
121
    roundIncrement = 0x40;
122
    if ( ! roundNearestEven ) {
123
        if ( roundingMode == float_round_to_zero ) {
124
            roundIncrement = 0;
125
        }
126
        else {
127
            roundIncrement = 0x7F;
128
            if ( zSign ) {
129
                if ( roundingMode == float_round_up ) roundIncrement = 0;
130
            }
131
            else {
132
                if ( roundingMode == float_round_down ) roundIncrement = 0;
133
            }
134
        }
135
    }
136
    roundBits = absZ & 0x7F;
137
    absZ = ( absZ + roundIncrement )>>7;
138
    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
139
    z = absZ;
140
    if ( zSign ) z = - z;
141
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
142
        float_raise( float_flag_invalid STATUS_VAR);
143
        return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
144
    }
145
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
146
    return z;
147

    
148
}
149

    
150
/*----------------------------------------------------------------------------
151
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
152
| `absZ1', with binary point between bits 63 and 64 (between the input words),
153
| and returns the properly rounded 64-bit integer corresponding to the input.
154
| If `zSign' is 1, the input is negated before being converted to an integer.
155
| Ordinarily, the fixed-point input is simply rounded to an integer, with
156
| the inexact exception raised if the input cannot be represented exactly as
157
| an integer.  However, if the fixed-point input is too large, the invalid
158
| exception is raised and the largest positive or negative integer is
159
| returned.
160
*----------------------------------------------------------------------------*/
161

    
162
static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
163
{
164
    int8 roundingMode;
165
    flag roundNearestEven, increment;
166
    int64 z;
167

    
168
    roundingMode = STATUS(float_rounding_mode);
169
    roundNearestEven = ( roundingMode == float_round_nearest_even );
170
    increment = ( (int64_t) absZ1 < 0 );
171
    if ( ! roundNearestEven ) {
172
        if ( roundingMode == float_round_to_zero ) {
173
            increment = 0;
174
        }
175
        else {
176
            if ( zSign ) {
177
                increment = ( roundingMode == float_round_down ) && absZ1;
178
            }
179
            else {
180
                increment = ( roundingMode == float_round_up ) && absZ1;
181
            }
182
        }
183
    }
184
    if ( increment ) {
185
        ++absZ0;
186
        if ( absZ0 == 0 ) goto overflow;
187
        absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
188
    }
189
    z = absZ0;
190
    if ( zSign ) z = - z;
191
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
192
 overflow:
193
        float_raise( float_flag_invalid STATUS_VAR);
194
        return
195
              zSign ? (int64_t) LIT64( 0x8000000000000000 )
196
            : LIT64( 0x7FFFFFFFFFFFFFFF );
197
    }
198
    if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
199
    return z;
200

    
201
}
202

    
203
/*----------------------------------------------------------------------------
204
| Returns the fraction bits of the single-precision floating-point value `a'.
205
*----------------------------------------------------------------------------*/
206

    
207
INLINE uint32_t extractFloat32Frac( float32 a )
208
{
209

    
210
    return float32_val(a) & 0x007FFFFF;
211

    
212
}
213

    
214
/*----------------------------------------------------------------------------
215
| Returns the exponent bits of the single-precision floating-point value `a'.
216
*----------------------------------------------------------------------------*/
217

    
218
INLINE int16 extractFloat32Exp( float32 a )
219
{
220

    
221
    return ( float32_val(a)>>23 ) & 0xFF;
222

    
223
}
224

    
225
/*----------------------------------------------------------------------------
226
| Returns the sign bit of the single-precision floating-point value `a'.
227
*----------------------------------------------------------------------------*/
228

    
229
INLINE flag extractFloat32Sign( float32 a )
230
{
231

    
232
    return float32_val(a)>>31;
233

    
234
}
235

    
236
/*----------------------------------------------------------------------------
237
| If `a' is denormal and we are in flush-to-zero mode then set the
238
| input-denormal exception and return zero. Otherwise just return the value.
239
*----------------------------------------------------------------------------*/
240
static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
241
{
242
    if (STATUS(flush_inputs_to_zero)) {
243
        if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
244
            float_raise(float_flag_input_denormal STATUS_VAR);
245
            return make_float32(float32_val(a) & 0x80000000);
246
        }
247
    }
248
    return a;
249
}
250

    
251
/*----------------------------------------------------------------------------
252
| Normalizes the subnormal single-precision floating-point value represented
253
| by the denormalized significand `aSig'.  The normalized exponent and
254
| significand are stored at the locations pointed to by `zExpPtr' and
255
| `zSigPtr', respectively.
256
*----------------------------------------------------------------------------*/
257

    
258
static void
259
 normalizeFloat32Subnormal( uint32_t aSig, int16 *zExpPtr, uint32_t *zSigPtr )
260
{
261
    int8 shiftCount;
262

    
263
    shiftCount = countLeadingZeros32( aSig ) - 8;
264
    *zSigPtr = aSig<<shiftCount;
265
    *zExpPtr = 1 - shiftCount;
266

    
267
}
268

    
269
/*----------------------------------------------------------------------------
270
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
271
| single-precision floating-point value, returning the result.  After being
272
| shifted into the proper positions, the three fields are simply added
273
| together to form the result.  This means that any integer portion of `zSig'
274
| will be added into the exponent.  Since a properly normalized significand
275
| will have an integer portion equal to 1, the `zExp' input should be 1 less
276
| than the desired result exponent whenever `zSig' is a complete, normalized
277
| significand.
278
*----------------------------------------------------------------------------*/
279

    
280
INLINE float32 packFloat32( flag zSign, int16 zExp, uint32_t zSig )
281
{
282

    
283
    return make_float32(
284
          ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
285

    
286
}
287

    
288
/*----------------------------------------------------------------------------
289
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
290
| and significand `zSig', and returns the proper single-precision floating-
291
| point value corresponding to the abstract input.  Ordinarily, the abstract
292
| value is simply rounded and packed into the single-precision format, with
293
| the inexact exception raised if the abstract input cannot be represented
294
| exactly.  However, if the abstract value is too large, the overflow and
295
| inexact exceptions are raised and an infinity or maximal finite value is
296
| returned.  If the abstract value is too small, the input value is rounded to
297
| a subnormal number, and the underflow and inexact exceptions are raised if
298
| the abstract input cannot be represented exactly as a subnormal single-
299
| precision floating-point number.
300
|     The input significand `zSig' has its binary point between bits 30
301
| and 29, which is 7 bits to the left of the usual location.  This shifted
302
| significand must be normalized or smaller.  If `zSig' is not normalized,
303
| `zExp' must be 0; in that case, the result returned is a subnormal number,
304
| and it must not require rounding.  In the usual case that `zSig' is
305
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
306
| The handling of underflow and overflow follows the IEC/IEEE Standard for
307
| Binary Floating-Point Arithmetic.
308
*----------------------------------------------------------------------------*/
309

    
310
static float32 roundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
311
{
312
    int8 roundingMode;
313
    flag roundNearestEven;
314
    int8 roundIncrement, roundBits;
315
    flag isTiny;
316

    
317
    roundingMode = STATUS(float_rounding_mode);
318
    roundNearestEven = ( roundingMode == float_round_nearest_even );
319
    roundIncrement = 0x40;
320
    if ( ! roundNearestEven ) {
321
        if ( roundingMode == float_round_to_zero ) {
322
            roundIncrement = 0;
323
        }
324
        else {
325
            roundIncrement = 0x7F;
326
            if ( zSign ) {
327
                if ( roundingMode == float_round_up ) roundIncrement = 0;
328
            }
329
            else {
330
                if ( roundingMode == float_round_down ) roundIncrement = 0;
331
            }
332
        }
333
    }
334
    roundBits = zSig & 0x7F;
335
    if ( 0xFD <= (uint16_t) zExp ) {
336
        if (    ( 0xFD < zExp )
337
             || (    ( zExp == 0xFD )
338
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
339
           ) {
340
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
341
            return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
342
        }
343
        if ( zExp < 0 ) {
344
            if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
345
            isTiny =
346
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
347
                || ( zExp < -1 )
348
                || ( zSig + roundIncrement < 0x80000000 );
349
            shift32RightJamming( zSig, - zExp, &zSig );
350
            zExp = 0;
351
            roundBits = zSig & 0x7F;
352
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
353
        }
354
    }
355
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
356
    zSig = ( zSig + roundIncrement )>>7;
357
    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
358
    if ( zSig == 0 ) zExp = 0;
359
    return packFloat32( zSign, zExp, zSig );
360

    
361
}
362

    
363
/*----------------------------------------------------------------------------
364
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
365
| and significand `zSig', and returns the proper single-precision floating-
366
| point value corresponding to the abstract input.  This routine is just like
367
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
368
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
369
| floating-point exponent.
370
*----------------------------------------------------------------------------*/
371

    
372
static float32
373
 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
374
{
375
    int8 shiftCount;
376

    
377
    shiftCount = countLeadingZeros32( zSig ) - 1;
378
    return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
379

    
380
}
381

    
382
/*----------------------------------------------------------------------------
383
| Returns the fraction bits of the double-precision floating-point value `a'.
384
*----------------------------------------------------------------------------*/
385

    
386
INLINE uint64_t extractFloat64Frac( float64 a )
387
{
388

    
389
    return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
390

    
391
}
392

    
393
/*----------------------------------------------------------------------------
394
| Returns the exponent bits of the double-precision floating-point value `a'.
395
*----------------------------------------------------------------------------*/
396

    
397
INLINE int16 extractFloat64Exp( float64 a )
398
{
399

    
400
    return ( float64_val(a)>>52 ) & 0x7FF;
401

    
402
}
403

    
404
/*----------------------------------------------------------------------------
405
| Returns the sign bit of the double-precision floating-point value `a'.
406
*----------------------------------------------------------------------------*/
407

    
408
INLINE flag extractFloat64Sign( float64 a )
409
{
410

    
411
    return float64_val(a)>>63;
412

    
413
}
414

    
415
/*----------------------------------------------------------------------------
416
| If `a' is denormal and we are in flush-to-zero mode then set the
417
| input-denormal exception and return zero. Otherwise just return the value.
418
*----------------------------------------------------------------------------*/
419
static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
420
{
421
    if (STATUS(flush_inputs_to_zero)) {
422
        if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
423
            float_raise(float_flag_input_denormal STATUS_VAR);
424
            return make_float64(float64_val(a) & (1ULL << 63));
425
        }
426
    }
427
    return a;
428
}
429

    
430
/*----------------------------------------------------------------------------
431
| Normalizes the subnormal double-precision floating-point value represented
432
| by the denormalized significand `aSig'.  The normalized exponent and
433
| significand are stored at the locations pointed to by `zExpPtr' and
434
| `zSigPtr', respectively.
435
*----------------------------------------------------------------------------*/
436

    
437
static void
438
 normalizeFloat64Subnormal( uint64_t aSig, int16 *zExpPtr, uint64_t *zSigPtr )
439
{
440
    int8 shiftCount;
441

    
442
    shiftCount = countLeadingZeros64( aSig ) - 11;
443
    *zSigPtr = aSig<<shiftCount;
444
    *zExpPtr = 1 - shiftCount;
445

    
446
}
447

    
448
/*----------------------------------------------------------------------------
449
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
450
| double-precision floating-point value, returning the result.  After being
451
| shifted into the proper positions, the three fields are simply added
452
| together to form the result.  This means that any integer portion of `zSig'
453
| will be added into the exponent.  Since a properly normalized significand
454
| will have an integer portion equal to 1, the `zExp' input should be 1 less
455
| than the desired result exponent whenever `zSig' is a complete, normalized
456
| significand.
457
*----------------------------------------------------------------------------*/
458

    
459
INLINE float64 packFloat64( flag zSign, int16 zExp, uint64_t zSig )
460
{
461

    
462
    return make_float64(
463
        ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
464

    
465
}
466

    
467
/*----------------------------------------------------------------------------
468
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
469
| and significand `zSig', and returns the proper double-precision floating-
470
| point value corresponding to the abstract input.  Ordinarily, the abstract
471
| value is simply rounded and packed into the double-precision format, with
472
| the inexact exception raised if the abstract input cannot be represented
473
| exactly.  However, if the abstract value is too large, the overflow and
474
| inexact exceptions are raised and an infinity or maximal finite value is
475
| returned.  If the abstract value is too small, the input value is rounded
476
| to a subnormal number, and the underflow and inexact exceptions are raised
477
| if the abstract input cannot be represented exactly as a subnormal double-
478
| precision floating-point number.
479
|     The input significand `zSig' has its binary point between bits 62
480
| and 61, which is 10 bits to the left of the usual location.  This shifted
481
| significand must be normalized or smaller.  If `zSig' is not normalized,
482
| `zExp' must be 0; in that case, the result returned is a subnormal number,
483
| and it must not require rounding.  In the usual case that `zSig' is
484
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
485
| The handling of underflow and overflow follows the IEC/IEEE Standard for
486
| Binary Floating-Point Arithmetic.
487
*----------------------------------------------------------------------------*/
488

    
489
static float64 roundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
490
{
491
    int8 roundingMode;
492
    flag roundNearestEven;
493
    int16 roundIncrement, roundBits;
494
    flag isTiny;
495

    
496
    roundingMode = STATUS(float_rounding_mode);
497
    roundNearestEven = ( roundingMode == float_round_nearest_even );
498
    roundIncrement = 0x200;
499
    if ( ! roundNearestEven ) {
500
        if ( roundingMode == float_round_to_zero ) {
501
            roundIncrement = 0;
502
        }
503
        else {
504
            roundIncrement = 0x3FF;
505
            if ( zSign ) {
506
                if ( roundingMode == float_round_up ) roundIncrement = 0;
507
            }
508
            else {
509
                if ( roundingMode == float_round_down ) roundIncrement = 0;
510
            }
511
        }
512
    }
513
    roundBits = zSig & 0x3FF;
514
    if ( 0x7FD <= (uint16_t) zExp ) {
515
        if (    ( 0x7FD < zExp )
516
             || (    ( zExp == 0x7FD )
517
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
518
           ) {
519
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
520
            return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
521
        }
522
        if ( zExp < 0 ) {
523
            if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
524
            isTiny =
525
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
526
                || ( zExp < -1 )
527
                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
528
            shift64RightJamming( zSig, - zExp, &zSig );
529
            zExp = 0;
530
            roundBits = zSig & 0x3FF;
531
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
532
        }
533
    }
534
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
535
    zSig = ( zSig + roundIncrement )>>10;
536
    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
537
    if ( zSig == 0 ) zExp = 0;
538
    return packFloat64( zSign, zExp, zSig );
539

    
540
}
541

    
542
/*----------------------------------------------------------------------------
543
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
544
| and significand `zSig', and returns the proper double-precision floating-
545
| point value corresponding to the abstract input.  This routine is just like
546
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
547
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
548
| floating-point exponent.
549
*----------------------------------------------------------------------------*/
550

    
551
static float64
552
 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
553
{
554
    int8 shiftCount;
555

    
556
    shiftCount = countLeadingZeros64( zSig ) - 1;
557
    return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
558

    
559
}
560

    
561
#ifdef FLOATX80
562

    
563
/*----------------------------------------------------------------------------
564
| Returns the fraction bits of the extended double-precision floating-point
565
| value `a'.
566
*----------------------------------------------------------------------------*/
567

    
568
INLINE uint64_t extractFloatx80Frac( floatx80 a )
569
{
570

    
571
    return a.low;
572

    
573
}
574

    
575
/*----------------------------------------------------------------------------
576
| Returns the exponent bits of the extended double-precision floating-point
577
| value `a'.
578
*----------------------------------------------------------------------------*/
579

    
580
INLINE int32 extractFloatx80Exp( floatx80 a )
581
{
582

    
583
    return a.high & 0x7FFF;
584

    
585
}
586

    
587
/*----------------------------------------------------------------------------
588
| Returns the sign bit of the extended double-precision floating-point value
589
| `a'.
590
*----------------------------------------------------------------------------*/
591

    
592
INLINE flag extractFloatx80Sign( floatx80 a )
593
{
594

    
595
    return a.high>>15;
596

    
597
}
598

    
599
/*----------------------------------------------------------------------------
600
| Normalizes the subnormal extended double-precision floating-point value
601
| represented by the denormalized significand `aSig'.  The normalized exponent
602
| and significand are stored at the locations pointed to by `zExpPtr' and
603
| `zSigPtr', respectively.
604
*----------------------------------------------------------------------------*/
605

    
606
static void
607
 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
608
{
609
    int8 shiftCount;
610

    
611
    shiftCount = countLeadingZeros64( aSig );
612
    *zSigPtr = aSig<<shiftCount;
613
    *zExpPtr = 1 - shiftCount;
614

    
615
}
616

    
617
/*----------------------------------------------------------------------------
618
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
619
| extended double-precision floating-point value, returning the result.
620
*----------------------------------------------------------------------------*/
621

    
622
INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
623
{
624
    floatx80 z;
625

    
626
    z.low = zSig;
627
    z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
628
    return z;
629

    
630
}
631

    
632
/*----------------------------------------------------------------------------
633
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
634
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
635
| and returns the proper extended double-precision floating-point value
636
| corresponding to the abstract input.  Ordinarily, the abstract value is
637
| rounded and packed into the extended double-precision format, with the
638
| inexact exception raised if the abstract input cannot be represented
639
| exactly.  However, if the abstract value is too large, the overflow and
640
| inexact exceptions are raised and an infinity or maximal finite value is
641
| returned.  If the abstract value is too small, the input value is rounded to
642
| a subnormal number, and the underflow and inexact exceptions are raised if
643
| the abstract input cannot be represented exactly as a subnormal extended
644
| double-precision floating-point number.
645
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
646
| number of bits as single or double precision, respectively.  Otherwise, the
647
| result is rounded to the full precision of the extended double-precision
648
| format.
649
|     The input significand must be normalized or smaller.  If the input
650
| significand is not normalized, `zExp' must be 0; in that case, the result
651
| returned is a subnormal number, and it must not require rounding.  The
652
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
653
| Floating-Point Arithmetic.
654
*----------------------------------------------------------------------------*/
655

    
656
static floatx80
657
 roundAndPackFloatx80(
658
     int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
659
 STATUS_PARAM)
660
{
661
    int8 roundingMode;
662
    flag roundNearestEven, increment, isTiny;
663
    int64 roundIncrement, roundMask, roundBits;
664

    
665
    roundingMode = STATUS(float_rounding_mode);
666
    roundNearestEven = ( roundingMode == float_round_nearest_even );
667
    if ( roundingPrecision == 80 ) goto precision80;
668
    if ( roundingPrecision == 64 ) {
669
        roundIncrement = LIT64( 0x0000000000000400 );
670
        roundMask = LIT64( 0x00000000000007FF );
671
    }
672
    else if ( roundingPrecision == 32 ) {
673
        roundIncrement = LIT64( 0x0000008000000000 );
674
        roundMask = LIT64( 0x000000FFFFFFFFFF );
675
    }
676
    else {
677
        goto precision80;
678
    }
679
    zSig0 |= ( zSig1 != 0 );
680
    if ( ! roundNearestEven ) {
681
        if ( roundingMode == float_round_to_zero ) {
682
            roundIncrement = 0;
683
        }
684
        else {
685
            roundIncrement = roundMask;
686
            if ( zSign ) {
687
                if ( roundingMode == float_round_up ) roundIncrement = 0;
688
            }
689
            else {
690
                if ( roundingMode == float_round_down ) roundIncrement = 0;
691
            }
692
        }
693
    }
694
    roundBits = zSig0 & roundMask;
695
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
696
        if (    ( 0x7FFE < zExp )
697
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
698
           ) {
699
            goto overflow;
700
        }
701
        if ( zExp <= 0 ) {
702
            if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
703
            isTiny =
704
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
705
                || ( zExp < 0 )
706
                || ( zSig0 <= zSig0 + roundIncrement );
707
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
708
            zExp = 0;
709
            roundBits = zSig0 & roundMask;
710
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
711
            if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
712
            zSig0 += roundIncrement;
713
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
714
            roundIncrement = roundMask + 1;
715
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
716
                roundMask |= roundIncrement;
717
            }
718
            zSig0 &= ~ roundMask;
719
            return packFloatx80( zSign, zExp, zSig0 );
720
        }
721
    }
722
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
723
    zSig0 += roundIncrement;
724
    if ( zSig0 < roundIncrement ) {
725
        ++zExp;
726
        zSig0 = LIT64( 0x8000000000000000 );
727
    }
728
    roundIncrement = roundMask + 1;
729
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
730
        roundMask |= roundIncrement;
731
    }
732
    zSig0 &= ~ roundMask;
733
    if ( zSig0 == 0 ) zExp = 0;
734
    return packFloatx80( zSign, zExp, zSig0 );
735
 precision80:
736
    increment = ( (int64_t) zSig1 < 0 );
737
    if ( ! roundNearestEven ) {
738
        if ( roundingMode == float_round_to_zero ) {
739
            increment = 0;
740
        }
741
        else {
742
            if ( zSign ) {
743
                increment = ( roundingMode == float_round_down ) && zSig1;
744
            }
745
            else {
746
                increment = ( roundingMode == float_round_up ) && zSig1;
747
            }
748
        }
749
    }
750
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
751
        if (    ( 0x7FFE < zExp )
752
             || (    ( zExp == 0x7FFE )
753
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
754
                  && increment
755
                )
756
           ) {
757
            roundMask = 0;
758
 overflow:
759
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
760
            if (    ( roundingMode == float_round_to_zero )
761
                 || ( zSign && ( roundingMode == float_round_up ) )
762
                 || ( ! zSign && ( roundingMode == float_round_down ) )
763
               ) {
764
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
765
            }
766
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
767
        }
768
        if ( zExp <= 0 ) {
769
            isTiny =
770
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
771
                || ( zExp < 0 )
772
                || ! increment
773
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
774
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
775
            zExp = 0;
776
            if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
777
            if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
778
            if ( roundNearestEven ) {
779
                increment = ( (int64_t) zSig1 < 0 );
780
            }
781
            else {
782
                if ( zSign ) {
783
                    increment = ( roundingMode == float_round_down ) && zSig1;
784
                }
785
                else {
786
                    increment = ( roundingMode == float_round_up ) && zSig1;
787
                }
788
            }
789
            if ( increment ) {
790
                ++zSig0;
791
                zSig0 &=
792
                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
793
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
794
            }
795
            return packFloatx80( zSign, zExp, zSig0 );
796
        }
797
    }
798
    if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
799
    if ( increment ) {
800
        ++zSig0;
801
        if ( zSig0 == 0 ) {
802
            ++zExp;
803
            zSig0 = LIT64( 0x8000000000000000 );
804
        }
805
        else {
806
            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
807
        }
808
    }
809
    else {
810
        if ( zSig0 == 0 ) zExp = 0;
811
    }
812
    return packFloatx80( zSign, zExp, zSig0 );
813

    
814
}
815

    
816
/*----------------------------------------------------------------------------
817
| Takes an abstract floating-point value having sign `zSign', exponent
818
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
819
| and returns the proper extended double-precision floating-point value
820
| corresponding to the abstract input.  This routine is just like
821
| `roundAndPackFloatx80' except that the input significand does not have to be
822
| normalized.
823
*----------------------------------------------------------------------------*/
824

    
825
static floatx80
826
 normalizeRoundAndPackFloatx80(
827
     int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
828
 STATUS_PARAM)
829
{
830
    int8 shiftCount;
831

    
832
    if ( zSig0 == 0 ) {
833
        zSig0 = zSig1;
834
        zSig1 = 0;
835
        zExp -= 64;
836
    }
837
    shiftCount = countLeadingZeros64( zSig0 );
838
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
839
    zExp -= shiftCount;
840
    return
841
        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
842

    
843
}
844

    
845
#endif
846

    
847
#ifdef FLOAT128
848

    
849
/*----------------------------------------------------------------------------
850
| Returns the least-significant 64 fraction bits of the quadruple-precision
851
| floating-point value `a'.
852
*----------------------------------------------------------------------------*/
853

    
854
INLINE uint64_t extractFloat128Frac1( float128 a )
855
{
856

    
857
    return a.low;
858

    
859
}
860

    
861
/*----------------------------------------------------------------------------
862
| Returns the most-significant 48 fraction bits of the quadruple-precision
863
| floating-point value `a'.
864
*----------------------------------------------------------------------------*/
865

    
866
INLINE uint64_t extractFloat128Frac0( float128 a )
867
{
868

    
869
    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
870

    
871
}
872

    
873
/*----------------------------------------------------------------------------
874
| Returns the exponent bits of the quadruple-precision floating-point value
875
| `a'.
876
*----------------------------------------------------------------------------*/
877

    
878
INLINE int32 extractFloat128Exp( float128 a )
879
{
880

    
881
    return ( a.high>>48 ) & 0x7FFF;
882

    
883
}
884

    
885
/*----------------------------------------------------------------------------
886
| Returns the sign bit of the quadruple-precision floating-point value `a'.
887
*----------------------------------------------------------------------------*/
888

    
889
INLINE flag extractFloat128Sign( float128 a )
890
{
891

    
892
    return a.high>>63;
893

    
894
}
895

    
896
/*----------------------------------------------------------------------------
897
| Normalizes the subnormal quadruple-precision floating-point value
898
| represented by the denormalized significand formed by the concatenation of
899
| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
900
| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
901
| significand are stored at the location pointed to by `zSig0Ptr', and the
902
| least significant 64 bits of the normalized significand are stored at the
903
| location pointed to by `zSig1Ptr'.
904
*----------------------------------------------------------------------------*/
905

    
906
static void
907
 normalizeFloat128Subnormal(
908
     uint64_t aSig0,
909
     uint64_t aSig1,
910
     int32 *zExpPtr,
911
     uint64_t *zSig0Ptr,
912
     uint64_t *zSig1Ptr
913
 )
914
{
915
    int8 shiftCount;
916

    
917
    if ( aSig0 == 0 ) {
918
        shiftCount = countLeadingZeros64( aSig1 ) - 15;
919
        if ( shiftCount < 0 ) {
920
            *zSig0Ptr = aSig1>>( - shiftCount );
921
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
922
        }
923
        else {
924
            *zSig0Ptr = aSig1<<shiftCount;
925
            *zSig1Ptr = 0;
926
        }
927
        *zExpPtr = - shiftCount - 63;
928
    }
929
    else {
930
        shiftCount = countLeadingZeros64( aSig0 ) - 15;
931
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
932
        *zExpPtr = 1 - shiftCount;
933
    }
934

    
935
}
936

    
937
/*----------------------------------------------------------------------------
938
| Packs the sign `zSign', the exponent `zExp', and the significand formed
939
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
940
| floating-point value, returning the result.  After being shifted into the
941
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
942
| added together to form the most significant 32 bits of the result.  This
943
| means that any integer portion of `zSig0' will be added into the exponent.
944
| Since a properly normalized significand will have an integer portion equal
945
| to 1, the `zExp' input should be 1 less than the desired result exponent
946
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
947
| significand.
948
*----------------------------------------------------------------------------*/
949

    
950
INLINE float128
951
 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
952
{
953
    float128 z;
954

    
955
    z.low = zSig1;
956
    z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
957
    return z;
958

    
959
}
960

    
961
/*----------------------------------------------------------------------------
962
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
963
| and extended significand formed by the concatenation of `zSig0', `zSig1',
964
| and `zSig2', and returns the proper quadruple-precision floating-point value
965
| corresponding to the abstract input.  Ordinarily, the abstract value is
966
| simply rounded and packed into the quadruple-precision format, with the
967
| inexact exception raised if the abstract input cannot be represented
968
| exactly.  However, if the abstract value is too large, the overflow and
969
| inexact exceptions are raised and an infinity or maximal finite value is
970
| returned.  If the abstract value is too small, the input value is rounded to
971
| a subnormal number, and the underflow and inexact exceptions are raised if
972
| the abstract input cannot be represented exactly as a subnormal quadruple-
973
| precision floating-point number.
974
|     The input significand must be normalized or smaller.  If the input
975
| significand is not normalized, `zExp' must be 0; in that case, the result
976
| returned is a subnormal number, and it must not require rounding.  In the
977
| usual case that the input significand is normalized, `zExp' must be 1 less
978
| than the ``true'' floating-point exponent.  The handling of underflow and
979
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
980
*----------------------------------------------------------------------------*/
981

    
982
static float128
983
 roundAndPackFloat128(
984
     flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
985
{
986
    int8 roundingMode;
987
    flag roundNearestEven, increment, isTiny;
988

    
989
    roundingMode = STATUS(float_rounding_mode);
990
    roundNearestEven = ( roundingMode == float_round_nearest_even );
991
    increment = ( (int64_t) zSig2 < 0 );
992
    if ( ! roundNearestEven ) {
993
        if ( roundingMode == float_round_to_zero ) {
994
            increment = 0;
995
        }
996
        else {
997
            if ( zSign ) {
998
                increment = ( roundingMode == float_round_down ) && zSig2;
999
            }
1000
            else {
1001
                increment = ( roundingMode == float_round_up ) && zSig2;
1002
            }
1003
        }
1004
    }
1005
    if ( 0x7FFD <= (uint32_t) zExp ) {
1006
        if (    ( 0x7FFD < zExp )
1007
             || (    ( zExp == 0x7FFD )
1008
                  && eq128(
1009
                         LIT64( 0x0001FFFFFFFFFFFF ),
1010
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
1011
                         zSig0,
1012
                         zSig1
1013
                     )
1014
                  && increment
1015
                )
1016
           ) {
1017
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1018
            if (    ( roundingMode == float_round_to_zero )
1019
                 || ( zSign && ( roundingMode == float_round_up ) )
1020
                 || ( ! zSign && ( roundingMode == float_round_down ) )
1021
               ) {
1022
                return
1023
                    packFloat128(
1024
                        zSign,
1025
                        0x7FFE,
1026
                        LIT64( 0x0000FFFFFFFFFFFF ),
1027
                        LIT64( 0xFFFFFFFFFFFFFFFF )
1028
                    );
1029
            }
1030
            return packFloat128( zSign, 0x7FFF, 0, 0 );
1031
        }
1032
        if ( zExp < 0 ) {
1033
            if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
1034
            isTiny =
1035
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1036
                || ( zExp < -1 )
1037
                || ! increment
1038
                || lt128(
1039
                       zSig0,
1040
                       zSig1,
1041
                       LIT64( 0x0001FFFFFFFFFFFF ),
1042
                       LIT64( 0xFFFFFFFFFFFFFFFF )
1043
                   );
1044
            shift128ExtraRightJamming(
1045
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1046
            zExp = 0;
1047
            if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1048
            if ( roundNearestEven ) {
1049
                increment = ( (int64_t) zSig2 < 0 );
1050
            }
1051
            else {
1052
                if ( zSign ) {
1053
                    increment = ( roundingMode == float_round_down ) && zSig2;
1054
                }
1055
                else {
1056
                    increment = ( roundingMode == float_round_up ) && zSig2;
1057
                }
1058
            }
1059
        }
1060
    }
1061
    if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1062
    if ( increment ) {
1063
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1064
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1065
    }
1066
    else {
1067
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1068
    }
1069
    return packFloat128( zSign, zExp, zSig0, zSig1 );
1070

    
1071
}
1072

    
1073
/*----------------------------------------------------------------------------
1074
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1075
| and significand formed by the concatenation of `zSig0' and `zSig1', and
1076
| returns the proper quadruple-precision floating-point value corresponding
1077
| to the abstract input.  This routine is just like `roundAndPackFloat128'
1078
| except that the input significand has fewer bits and does not have to be
1079
| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1080
| point exponent.
1081
*----------------------------------------------------------------------------*/
1082

    
1083
static float128
1084
 normalizeRoundAndPackFloat128(
1085
     flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1086
{
1087
    int8 shiftCount;
1088
    uint64_t zSig2;
1089

    
1090
    if ( zSig0 == 0 ) {
1091
        zSig0 = zSig1;
1092
        zSig1 = 0;
1093
        zExp -= 64;
1094
    }
1095
    shiftCount = countLeadingZeros64( zSig0 ) - 15;
1096
    if ( 0 <= shiftCount ) {
1097
        zSig2 = 0;
1098
        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1099
    }
1100
    else {
1101
        shift128ExtraRightJamming(
1102
            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1103
    }
1104
    zExp -= shiftCount;
1105
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1106

    
1107
}
1108

    
1109
#endif
1110

    
1111
/*----------------------------------------------------------------------------
1112
| Returns the result of converting the 32-bit two's complement integer `a'
1113
| to the single-precision floating-point format.  The conversion is performed
1114
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1115
*----------------------------------------------------------------------------*/
1116

    
1117
float32 int32_to_float32( int32 a STATUS_PARAM )
1118
{
1119
    flag zSign;
1120

    
1121
    if ( a == 0 ) return float32_zero;
1122
    if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1123
    zSign = ( a < 0 );
1124
    return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1125

    
1126
}
1127

    
1128
/*----------------------------------------------------------------------------
1129
| Returns the result of converting the 32-bit two's complement integer `a'
1130
| to the double-precision floating-point format.  The conversion is performed
1131
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1132
*----------------------------------------------------------------------------*/
1133

    
1134
float64 int32_to_float64( int32 a STATUS_PARAM )
1135
{
1136
    flag zSign;
1137
    uint32 absA;
1138
    int8 shiftCount;
1139
    uint64_t zSig;
1140

    
1141
    if ( a == 0 ) return float64_zero;
1142
    zSign = ( a < 0 );
1143
    absA = zSign ? - a : a;
1144
    shiftCount = countLeadingZeros32( absA ) + 21;
1145
    zSig = absA;
1146
    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1147

    
1148
}
1149

    
1150
#ifdef FLOATX80
1151

    
1152
/*----------------------------------------------------------------------------
1153
| Returns the result of converting the 32-bit two's complement integer `a'
1154
| to the extended double-precision floating-point format.  The conversion
1155
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1156
| Arithmetic.
1157
*----------------------------------------------------------------------------*/
1158

    
1159
floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1160
{
1161
    flag zSign;
1162
    uint32 absA;
1163
    int8 shiftCount;
1164
    uint64_t zSig;
1165

    
1166
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1167
    zSign = ( a < 0 );
1168
    absA = zSign ? - a : a;
1169
    shiftCount = countLeadingZeros32( absA ) + 32;
1170
    zSig = absA;
1171
    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1172

    
1173
}
1174

    
1175
#endif
1176

    
1177
#ifdef FLOAT128
1178

    
1179
/*----------------------------------------------------------------------------
1180
| Returns the result of converting the 32-bit two's complement integer `a' to
1181
| the quadruple-precision floating-point format.  The conversion is performed
1182
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1183
*----------------------------------------------------------------------------*/
1184

    
1185
float128 int32_to_float128( int32 a STATUS_PARAM )
1186
{
1187
    flag zSign;
1188
    uint32 absA;
1189
    int8 shiftCount;
1190
    uint64_t zSig0;
1191

    
1192
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1193
    zSign = ( a < 0 );
1194
    absA = zSign ? - a : a;
1195
    shiftCount = countLeadingZeros32( absA ) + 17;
1196
    zSig0 = absA;
1197
    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1198

    
1199
}
1200

    
1201
#endif
1202

    
1203
/*----------------------------------------------------------------------------
1204
| Returns the result of converting the 64-bit two's complement integer `a'
1205
| to the single-precision floating-point format.  The conversion is performed
1206
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1207
*----------------------------------------------------------------------------*/
1208

    
1209
float32 int64_to_float32( int64 a STATUS_PARAM )
1210
{
1211
    flag zSign;
1212
    uint64 absA;
1213
    int8 shiftCount;
1214

    
1215
    if ( a == 0 ) return float32_zero;
1216
    zSign = ( a < 0 );
1217
    absA = zSign ? - a : a;
1218
    shiftCount = countLeadingZeros64( absA ) - 40;
1219
    if ( 0 <= shiftCount ) {
1220
        return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1221
    }
1222
    else {
1223
        shiftCount += 7;
1224
        if ( shiftCount < 0 ) {
1225
            shift64RightJamming( absA, - shiftCount, &absA );
1226
        }
1227
        else {
1228
            absA <<= shiftCount;
1229
        }
1230
        return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1231
    }
1232

    
1233
}
1234

    
1235
float32 uint64_to_float32( uint64 a STATUS_PARAM )
1236
{
1237
    int8 shiftCount;
1238

    
1239
    if ( a == 0 ) return float32_zero;
1240
    shiftCount = countLeadingZeros64( a ) - 40;
1241
    if ( 0 <= shiftCount ) {
1242
        return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1243
    }
1244
    else {
1245
        shiftCount += 7;
1246
        if ( shiftCount < 0 ) {
1247
            shift64RightJamming( a, - shiftCount, &a );
1248
        }
1249
        else {
1250
            a <<= shiftCount;
1251
        }
1252
        return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1253
    }
1254
}
1255

    
1256
/*----------------------------------------------------------------------------
1257
| Returns the result of converting the 64-bit two's complement integer `a'
1258
| to the double-precision floating-point format.  The conversion is performed
1259
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1260
*----------------------------------------------------------------------------*/
1261

    
1262
float64 int64_to_float64( int64 a STATUS_PARAM )
1263
{
1264
    flag zSign;
1265

    
1266
    if ( a == 0 ) return float64_zero;
1267
    if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1268
        return packFloat64( 1, 0x43E, 0 );
1269
    }
1270
    zSign = ( a < 0 );
1271
    return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1272

    
1273
}
1274

    
1275
float64 uint64_to_float64( uint64 a STATUS_PARAM )
1276
{
1277
    if ( a == 0 ) return float64_zero;
1278
    return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1279

    
1280
}
1281

    
1282
#ifdef FLOATX80
1283

    
1284
/*----------------------------------------------------------------------------
1285
| Returns the result of converting the 64-bit two's complement integer `a'
1286
| to the extended double-precision floating-point format.  The conversion
1287
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1288
| Arithmetic.
1289
*----------------------------------------------------------------------------*/
1290

    
1291
floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1292
{
1293
    flag zSign;
1294
    uint64 absA;
1295
    int8 shiftCount;
1296

    
1297
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1298
    zSign = ( a < 0 );
1299
    absA = zSign ? - a : a;
1300
    shiftCount = countLeadingZeros64( absA );
1301
    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1302

    
1303
}
1304

    
1305
#endif
1306

    
1307
#ifdef FLOAT128
1308

    
1309
/*----------------------------------------------------------------------------
1310
| Returns the result of converting the 64-bit two's complement integer `a' to
1311
| the quadruple-precision floating-point format.  The conversion is performed
1312
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1313
*----------------------------------------------------------------------------*/
1314

    
1315
float128 int64_to_float128( int64 a STATUS_PARAM )
1316
{
1317
    flag zSign;
1318
    uint64 absA;
1319
    int8 shiftCount;
1320
    int32 zExp;
1321
    uint64_t zSig0, zSig1;
1322

    
1323
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1324
    zSign = ( a < 0 );
1325
    absA = zSign ? - a : a;
1326
    shiftCount = countLeadingZeros64( absA ) + 49;
1327
    zExp = 0x406E - shiftCount;
1328
    if ( 64 <= shiftCount ) {
1329
        zSig1 = 0;
1330
        zSig0 = absA;
1331
        shiftCount -= 64;
1332
    }
1333
    else {
1334
        zSig1 = absA;
1335
        zSig0 = 0;
1336
    }
1337
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1338
    return packFloat128( zSign, zExp, zSig0, zSig1 );
1339

    
1340
}
1341

    
1342
#endif
1343

    
1344
/*----------------------------------------------------------------------------
1345
| Returns the result of converting the single-precision floating-point value
1346
| `a' to the 32-bit two's complement integer format.  The conversion is
1347
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1348
| Arithmetic---which means in particular that the conversion is rounded
1349
| according to the current rounding mode.  If `a' is a NaN, the largest
1350
| positive integer is returned.  Otherwise, if the conversion overflows, the
1351
| largest integer with the same sign as `a' is returned.
1352
*----------------------------------------------------------------------------*/
1353

    
1354
int32 float32_to_int32( float32 a STATUS_PARAM )
1355
{
1356
    flag aSign;
1357
    int16 aExp, shiftCount;
1358
    uint32_t aSig;
1359
    uint64_t aSig64;
1360

    
1361
    a = float32_squash_input_denormal(a STATUS_VAR);
1362
    aSig = extractFloat32Frac( a );
1363
    aExp = extractFloat32Exp( a );
1364
    aSign = extractFloat32Sign( a );
1365
    if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1366
    if ( aExp ) aSig |= 0x00800000;
1367
    shiftCount = 0xAF - aExp;
1368
    aSig64 = aSig;
1369
    aSig64 <<= 32;
1370
    if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1371
    return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1372

    
1373
}
1374

    
1375
/*----------------------------------------------------------------------------
1376
| Returns the result of converting the single-precision floating-point value
1377
| `a' to the 32-bit two's complement integer format.  The conversion is
1378
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1379
| Arithmetic, except that the conversion is always rounded toward zero.
1380
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1381
| the conversion overflows, the largest integer with the same sign as `a' is
1382
| returned.
1383
*----------------------------------------------------------------------------*/
1384

    
1385
int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1386
{
1387
    flag aSign;
1388
    int16 aExp, shiftCount;
1389
    uint32_t aSig;
1390
    int32 z;
1391
    a = float32_squash_input_denormal(a STATUS_VAR);
1392

    
1393
    aSig = extractFloat32Frac( a );
1394
    aExp = extractFloat32Exp( a );
1395
    aSign = extractFloat32Sign( a );
1396
    shiftCount = aExp - 0x9E;
1397
    if ( 0 <= shiftCount ) {
1398
        if ( float32_val(a) != 0xCF000000 ) {
1399
            float_raise( float_flag_invalid STATUS_VAR);
1400
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1401
        }
1402
        return (int32_t) 0x80000000;
1403
    }
1404
    else if ( aExp <= 0x7E ) {
1405
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1406
        return 0;
1407
    }
1408
    aSig = ( aSig | 0x00800000 )<<8;
1409
    z = aSig>>( - shiftCount );
1410
    if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1411
        STATUS(float_exception_flags) |= float_flag_inexact;
1412
    }
1413
    if ( aSign ) z = - z;
1414
    return z;
1415

    
1416
}
1417

    
1418
/*----------------------------------------------------------------------------
1419
| Returns the result of converting the single-precision floating-point value
1420
| `a' to the 16-bit two's complement integer format.  The conversion is
1421
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1422
| Arithmetic, except that the conversion is always rounded toward zero.
1423
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1424
| the conversion overflows, the largest integer with the same sign as `a' is
1425
| returned.
1426
*----------------------------------------------------------------------------*/
1427

    
1428
int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1429
{
1430
    flag aSign;
1431
    int16 aExp, shiftCount;
1432
    uint32_t aSig;
1433
    int32 z;
1434

    
1435
    aSig = extractFloat32Frac( a );
1436
    aExp = extractFloat32Exp( a );
1437
    aSign = extractFloat32Sign( a );
1438
    shiftCount = aExp - 0x8E;
1439
    if ( 0 <= shiftCount ) {
1440
        if ( float32_val(a) != 0xC7000000 ) {
1441
            float_raise( float_flag_invalid STATUS_VAR);
1442
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1443
                return 0x7FFF;
1444
            }
1445
        }
1446
        return (int32_t) 0xffff8000;
1447
    }
1448
    else if ( aExp <= 0x7E ) {
1449
        if ( aExp | aSig ) {
1450
            STATUS(float_exception_flags) |= float_flag_inexact;
1451
        }
1452
        return 0;
1453
    }
1454
    shiftCount -= 0x10;
1455
    aSig = ( aSig | 0x00800000 )<<8;
1456
    z = aSig>>( - shiftCount );
1457
    if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1458
        STATUS(float_exception_flags) |= float_flag_inexact;
1459
    }
1460
    if ( aSign ) {
1461
        z = - z;
1462
    }
1463
    return z;
1464

    
1465
}
1466

    
1467
/*----------------------------------------------------------------------------
1468
| Returns the result of converting the single-precision floating-point value
1469
| `a' to the 64-bit two's complement integer format.  The conversion is
1470
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1471
| Arithmetic---which means in particular that the conversion is rounded
1472
| according to the current rounding mode.  If `a' is a NaN, the largest
1473
| positive integer is returned.  Otherwise, if the conversion overflows, the
1474
| largest integer with the same sign as `a' is returned.
1475
*----------------------------------------------------------------------------*/
1476

    
1477
int64 float32_to_int64( float32 a STATUS_PARAM )
1478
{
1479
    flag aSign;
1480
    int16 aExp, shiftCount;
1481
    uint32_t aSig;
1482
    uint64_t aSig64, aSigExtra;
1483
    a = float32_squash_input_denormal(a STATUS_VAR);
1484

    
1485
    aSig = extractFloat32Frac( a );
1486
    aExp = extractFloat32Exp( a );
1487
    aSign = extractFloat32Sign( a );
1488
    shiftCount = 0xBE - aExp;
1489
    if ( shiftCount < 0 ) {
1490
        float_raise( float_flag_invalid STATUS_VAR);
1491
        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1492
            return LIT64( 0x7FFFFFFFFFFFFFFF );
1493
        }
1494
        return (int64_t) LIT64( 0x8000000000000000 );
1495
    }
1496
    if ( aExp ) aSig |= 0x00800000;
1497
    aSig64 = aSig;
1498
    aSig64 <<= 40;
1499
    shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1500
    return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1501

    
1502
}
1503

    
1504
/*----------------------------------------------------------------------------
1505
| Returns the result of converting the single-precision floating-point value
1506
| `a' to the 64-bit two's complement integer format.  The conversion is
1507
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1508
| Arithmetic, except that the conversion is always rounded toward zero.  If
1509
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1510
| conversion overflows, the largest integer with the same sign as `a' is
1511
| returned.
1512
*----------------------------------------------------------------------------*/
1513

    
1514
int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1515
{
1516
    flag aSign;
1517
    int16 aExp, shiftCount;
1518
    uint32_t aSig;
1519
    uint64_t aSig64;
1520
    int64 z;
1521
    a = float32_squash_input_denormal(a STATUS_VAR);
1522

    
1523
    aSig = extractFloat32Frac( a );
1524
    aExp = extractFloat32Exp( a );
1525
    aSign = extractFloat32Sign( a );
1526
    shiftCount = aExp - 0xBE;
1527
    if ( 0 <= shiftCount ) {
1528
        if ( float32_val(a) != 0xDF000000 ) {
1529
            float_raise( float_flag_invalid STATUS_VAR);
1530
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1531
                return LIT64( 0x7FFFFFFFFFFFFFFF );
1532
            }
1533
        }
1534
        return (int64_t) LIT64( 0x8000000000000000 );
1535
    }
1536
    else if ( aExp <= 0x7E ) {
1537
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1538
        return 0;
1539
    }
1540
    aSig64 = aSig | 0x00800000;
1541
    aSig64 <<= 40;
1542
    z = aSig64>>( - shiftCount );
1543
    if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1544
        STATUS(float_exception_flags) |= float_flag_inexact;
1545
    }
1546
    if ( aSign ) z = - z;
1547
    return z;
1548

    
1549
}
1550

    
1551
/*----------------------------------------------------------------------------
1552
| Returns the result of converting the single-precision floating-point value
1553
| `a' to the double-precision floating-point format.  The conversion is
1554
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1555
| Arithmetic.
1556
*----------------------------------------------------------------------------*/
1557

    
1558
float64 float32_to_float64( float32 a STATUS_PARAM )
1559
{
1560
    flag aSign;
1561
    int16 aExp;
1562
    uint32_t aSig;
1563
    a = float32_squash_input_denormal(a STATUS_VAR);
1564

    
1565
    aSig = extractFloat32Frac( a );
1566
    aExp = extractFloat32Exp( a );
1567
    aSign = extractFloat32Sign( a );
1568
    if ( aExp == 0xFF ) {
1569
        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1570
        return packFloat64( aSign, 0x7FF, 0 );
1571
    }
1572
    if ( aExp == 0 ) {
1573
        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1574
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1575
        --aExp;
1576
    }
1577
    return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1578

    
1579
}
1580

    
1581
#ifdef FLOATX80
1582

    
1583
/*----------------------------------------------------------------------------
1584
| Returns the result of converting the single-precision floating-point value
1585
| `a' to the extended double-precision floating-point format.  The conversion
1586
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1587
| Arithmetic.
1588
*----------------------------------------------------------------------------*/
1589

    
1590
floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1591
{
1592
    flag aSign;
1593
    int16 aExp;
1594
    uint32_t aSig;
1595

    
1596
    a = float32_squash_input_denormal(a STATUS_VAR);
1597
    aSig = extractFloat32Frac( a );
1598
    aExp = extractFloat32Exp( a );
1599
    aSign = extractFloat32Sign( a );
1600
    if ( aExp == 0xFF ) {
1601
        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1602
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1603
    }
1604
    if ( aExp == 0 ) {
1605
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1606
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1607
    }
1608
    aSig |= 0x00800000;
1609
    return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1610

    
1611
}
1612

    
1613
#endif
1614

    
1615
#ifdef FLOAT128
1616

    
1617
/*----------------------------------------------------------------------------
1618
| Returns the result of converting the single-precision floating-point value
1619
| `a' to the double-precision floating-point format.  The conversion is
1620
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1621
| Arithmetic.
1622
*----------------------------------------------------------------------------*/
1623

    
1624
float128 float32_to_float128( float32 a STATUS_PARAM )
1625
{
1626
    flag aSign;
1627
    int16 aExp;
1628
    uint32_t aSig;
1629

    
1630
    a = float32_squash_input_denormal(a STATUS_VAR);
1631
    aSig = extractFloat32Frac( a );
1632
    aExp = extractFloat32Exp( a );
1633
    aSign = extractFloat32Sign( a );
1634
    if ( aExp == 0xFF ) {
1635
        if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1636
        return packFloat128( aSign, 0x7FFF, 0, 0 );
1637
    }
1638
    if ( aExp == 0 ) {
1639
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1640
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1641
        --aExp;
1642
    }
1643
    return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1644

    
1645
}
1646

    
1647
#endif
1648

    
1649
/*----------------------------------------------------------------------------
1650
| Rounds the single-precision floating-point value `a' to an integer, and
1651
| returns the result as a single-precision floating-point value.  The
1652
| operation is performed according to the IEC/IEEE Standard for Binary
1653
| Floating-Point Arithmetic.
1654
*----------------------------------------------------------------------------*/
1655

    
1656
float32 float32_round_to_int( float32 a STATUS_PARAM)
1657
{
1658
    flag aSign;
1659
    int16 aExp;
1660
    uint32_t lastBitMask, roundBitsMask;
1661
    int8 roundingMode;
1662
    uint32_t z;
1663
    a = float32_squash_input_denormal(a STATUS_VAR);
1664

    
1665
    aExp = extractFloat32Exp( a );
1666
    if ( 0x96 <= aExp ) {
1667
        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1668
            return propagateFloat32NaN( a, a STATUS_VAR );
1669
        }
1670
        return a;
1671
    }
1672
    if ( aExp <= 0x7E ) {
1673
        if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1674
        STATUS(float_exception_flags) |= float_flag_inexact;
1675
        aSign = extractFloat32Sign( a );
1676
        switch ( STATUS(float_rounding_mode) ) {
1677
         case float_round_nearest_even:
1678
            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1679
                return packFloat32( aSign, 0x7F, 0 );
1680
            }
1681
            break;
1682
         case float_round_down:
1683
            return make_float32(aSign ? 0xBF800000 : 0);
1684
         case float_round_up:
1685
            return make_float32(aSign ? 0x80000000 : 0x3F800000);
1686
        }
1687
        return packFloat32( aSign, 0, 0 );
1688
    }
1689
    lastBitMask = 1;
1690
    lastBitMask <<= 0x96 - aExp;
1691
    roundBitsMask = lastBitMask - 1;
1692
    z = float32_val(a);
1693
    roundingMode = STATUS(float_rounding_mode);
1694
    if ( roundingMode == float_round_nearest_even ) {
1695
        z += lastBitMask>>1;
1696
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1697
    }
1698
    else if ( roundingMode != float_round_to_zero ) {
1699
        if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1700
            z += roundBitsMask;
1701
        }
1702
    }
1703
    z &= ~ roundBitsMask;
1704
    if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1705
    return make_float32(z);
1706

    
1707
}
1708

    
1709
/*----------------------------------------------------------------------------
1710
| Returns the result of adding the absolute values of the single-precision
1711
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1712
| before being returned.  `zSign' is ignored if the result is a NaN.
1713
| The addition is performed according to the IEC/IEEE Standard for Binary
1714
| Floating-Point Arithmetic.
1715
*----------------------------------------------------------------------------*/
1716

    
1717
static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1718
{
1719
    int16 aExp, bExp, zExp;
1720
    uint32_t aSig, bSig, zSig;
1721
    int16 expDiff;
1722

    
1723
    aSig = extractFloat32Frac( a );
1724
    aExp = extractFloat32Exp( a );
1725
    bSig = extractFloat32Frac( b );
1726
    bExp = extractFloat32Exp( b );
1727
    expDiff = aExp - bExp;
1728
    aSig <<= 6;
1729
    bSig <<= 6;
1730
    if ( 0 < expDiff ) {
1731
        if ( aExp == 0xFF ) {
1732
            if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1733
            return a;
1734
        }
1735
        if ( bExp == 0 ) {
1736
            --expDiff;
1737
        }
1738
        else {
1739
            bSig |= 0x20000000;
1740
        }
1741
        shift32RightJamming( bSig, expDiff, &bSig );
1742
        zExp = aExp;
1743
    }
1744
    else if ( expDiff < 0 ) {
1745
        if ( bExp == 0xFF ) {
1746
            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1747
            return packFloat32( zSign, 0xFF, 0 );
1748
        }
1749
        if ( aExp == 0 ) {
1750
            ++expDiff;
1751
        }
1752
        else {
1753
            aSig |= 0x20000000;
1754
        }
1755
        shift32RightJamming( aSig, - expDiff, &aSig );
1756
        zExp = bExp;
1757
    }
1758
    else {
1759
        if ( aExp == 0xFF ) {
1760
            if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1761
            return a;
1762
        }
1763
        if ( aExp == 0 ) {
1764
            if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1765
            return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1766
        }
1767
        zSig = 0x40000000 + aSig + bSig;
1768
        zExp = aExp;
1769
        goto roundAndPack;
1770
    }
1771
    aSig |= 0x20000000;
1772
    zSig = ( aSig + bSig )<<1;
1773
    --zExp;
1774
    if ( (int32_t) zSig < 0 ) {
1775
        zSig = aSig + bSig;
1776
        ++zExp;
1777
    }
1778
 roundAndPack:
1779
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1780

    
1781
}
1782

    
1783
/*----------------------------------------------------------------------------
1784
| Returns the result of subtracting the absolute values of the single-
1785
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
1786
| difference is negated before being returned.  `zSign' is ignored if the
1787
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
1788
| Standard for Binary Floating-Point Arithmetic.
1789
*----------------------------------------------------------------------------*/
1790

    
1791
static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1792
{
1793
    int16 aExp, bExp, zExp;
1794
    uint32_t aSig, bSig, zSig;
1795
    int16 expDiff;
1796

    
1797
    aSig = extractFloat32Frac( a );
1798
    aExp = extractFloat32Exp( a );
1799
    bSig = extractFloat32Frac( b );
1800
    bExp = extractFloat32Exp( b );
1801
    expDiff = aExp - bExp;
1802
    aSig <<= 7;
1803
    bSig <<= 7;
1804
    if ( 0 < expDiff ) goto aExpBigger;
1805
    if ( expDiff < 0 ) goto bExpBigger;
1806
    if ( aExp == 0xFF ) {
1807
        if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1808
        float_raise( float_flag_invalid STATUS_VAR);
1809
        return float32_default_nan;
1810
    }
1811
    if ( aExp == 0 ) {
1812
        aExp = 1;
1813
        bExp = 1;
1814
    }
1815
    if ( bSig < aSig ) goto aBigger;
1816
    if ( aSig < bSig ) goto bBigger;
1817
    return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1818
 bExpBigger:
1819
    if ( bExp == 0xFF ) {
1820
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1821
        return packFloat32( zSign ^ 1, 0xFF, 0 );
1822
    }
1823
    if ( aExp == 0 ) {
1824
        ++expDiff;
1825
    }
1826
    else {
1827
        aSig |= 0x40000000;
1828
    }
1829
    shift32RightJamming( aSig, - expDiff, &aSig );
1830
    bSig |= 0x40000000;
1831
 bBigger:
1832
    zSig = bSig - aSig;
1833
    zExp = bExp;
1834
    zSign ^= 1;
1835
    goto normalizeRoundAndPack;
1836
 aExpBigger:
1837
    if ( aExp == 0xFF ) {
1838
        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1839
        return a;
1840
    }
1841
    if ( bExp == 0 ) {
1842
        --expDiff;
1843
    }
1844
    else {
1845
        bSig |= 0x40000000;
1846
    }
1847
    shift32RightJamming( bSig, expDiff, &bSig );
1848
    aSig |= 0x40000000;
1849
 aBigger:
1850
    zSig = aSig - bSig;
1851
    zExp = aExp;
1852
 normalizeRoundAndPack:
1853
    --zExp;
1854
    return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1855

    
1856
}
1857

    
1858
/*----------------------------------------------------------------------------
1859
| Returns the result of adding the single-precision floating-point values `a'
1860
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
1861
| Binary Floating-Point Arithmetic.
1862
*----------------------------------------------------------------------------*/
1863

    
1864
float32 float32_add( float32 a, float32 b STATUS_PARAM )
1865
{
1866
    flag aSign, bSign;
1867
    a = float32_squash_input_denormal(a STATUS_VAR);
1868
    b = float32_squash_input_denormal(b STATUS_VAR);
1869

    
1870
    aSign = extractFloat32Sign( a );
1871
    bSign = extractFloat32Sign( b );
1872
    if ( aSign == bSign ) {
1873
        return addFloat32Sigs( a, b, aSign STATUS_VAR);
1874
    }
1875
    else {
1876
        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1877
    }
1878

    
1879
}
1880

    
1881
/*----------------------------------------------------------------------------
1882
| Returns the result of subtracting the single-precision floating-point values
1883
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1884
| for Binary Floating-Point Arithmetic.
1885
*----------------------------------------------------------------------------*/
1886

    
1887
float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1888
{
1889
    flag aSign, bSign;
1890
    a = float32_squash_input_denormal(a STATUS_VAR);
1891
    b = float32_squash_input_denormal(b STATUS_VAR);
1892

    
1893
    aSign = extractFloat32Sign( a );
1894
    bSign = extractFloat32Sign( b );
1895
    if ( aSign == bSign ) {
1896
        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1897
    }
1898
    else {
1899
        return addFloat32Sigs( a, b, aSign STATUS_VAR );
1900
    }
1901

    
1902
}
1903

    
1904
/*----------------------------------------------------------------------------
1905
| Returns the result of multiplying the single-precision floating-point values
1906
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1907
| for Binary Floating-Point Arithmetic.
1908
*----------------------------------------------------------------------------*/
1909

    
1910
float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1911
{
1912
    flag aSign, bSign, zSign;
1913
    int16 aExp, bExp, zExp;
1914
    uint32_t aSig, bSig;
1915
    uint64_t zSig64;
1916
    uint32_t zSig;
1917

    
1918
    a = float32_squash_input_denormal(a STATUS_VAR);
1919
    b = float32_squash_input_denormal(b STATUS_VAR);
1920

    
1921
    aSig = extractFloat32Frac( a );
1922
    aExp = extractFloat32Exp( a );
1923
    aSign = extractFloat32Sign( a );
1924
    bSig = extractFloat32Frac( b );
1925
    bExp = extractFloat32Exp( b );
1926
    bSign = extractFloat32Sign( b );
1927
    zSign = aSign ^ bSign;
1928
    if ( aExp == 0xFF ) {
1929
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1930
            return propagateFloat32NaN( a, b STATUS_VAR );
1931
        }
1932
        if ( ( bExp | bSig ) == 0 ) {
1933
            float_raise( float_flag_invalid STATUS_VAR);
1934
            return float32_default_nan;
1935
        }
1936
        return packFloat32( zSign, 0xFF, 0 );
1937
    }
1938
    if ( bExp == 0xFF ) {
1939
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1940
        if ( ( aExp | aSig ) == 0 ) {
1941
            float_raise( float_flag_invalid STATUS_VAR);
1942
            return float32_default_nan;
1943
        }
1944
        return packFloat32( zSign, 0xFF, 0 );
1945
    }
1946
    if ( aExp == 0 ) {
1947
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1948
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1949
    }
1950
    if ( bExp == 0 ) {
1951
        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1952
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1953
    }
1954
    zExp = aExp + bExp - 0x7F;
1955
    aSig = ( aSig | 0x00800000 )<<7;
1956
    bSig = ( bSig | 0x00800000 )<<8;
1957
    shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
1958
    zSig = zSig64;
1959
    if ( 0 <= (int32_t) ( zSig<<1 ) ) {
1960
        zSig <<= 1;
1961
        --zExp;
1962
    }
1963
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1964

    
1965
}
1966

    
1967
/*----------------------------------------------------------------------------
1968
| Returns the result of dividing the single-precision floating-point value `a'
1969
| by the corresponding value `b'.  The operation is performed according to the
1970
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1971
*----------------------------------------------------------------------------*/
1972

    
1973
float32 float32_div( float32 a, float32 b STATUS_PARAM )
1974
{
1975
    flag aSign, bSign, zSign;
1976
    int16 aExp, bExp, zExp;
1977
    uint32_t aSig, bSig, zSig;
1978
    a = float32_squash_input_denormal(a STATUS_VAR);
1979
    b = float32_squash_input_denormal(b STATUS_VAR);
1980

    
1981
    aSig = extractFloat32Frac( a );
1982
    aExp = extractFloat32Exp( a );
1983
    aSign = extractFloat32Sign( a );
1984
    bSig = extractFloat32Frac( b );
1985
    bExp = extractFloat32Exp( b );
1986
    bSign = extractFloat32Sign( b );
1987
    zSign = aSign ^ bSign;
1988
    if ( aExp == 0xFF ) {
1989
        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1990
        if ( bExp == 0xFF ) {
1991
            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1992
            float_raise( float_flag_invalid STATUS_VAR);
1993
            return float32_default_nan;
1994
        }
1995
        return packFloat32( zSign, 0xFF, 0 );
1996
    }
1997
    if ( bExp == 0xFF ) {
1998
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1999
        return packFloat32( zSign, 0, 0 );
2000
    }
2001
    if ( bExp == 0 ) {
2002
        if ( bSig == 0 ) {
2003
            if ( ( aExp | aSig ) == 0 ) {
2004
                float_raise( float_flag_invalid STATUS_VAR);
2005
                return float32_default_nan;
2006
            }
2007
            float_raise( float_flag_divbyzero STATUS_VAR);
2008
            return packFloat32( zSign, 0xFF, 0 );
2009
        }
2010
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2011
    }
2012
    if ( aExp == 0 ) {
2013
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2014
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2015
    }
2016
    zExp = aExp - bExp + 0x7D;
2017
    aSig = ( aSig | 0x00800000 )<<7;
2018
    bSig = ( bSig | 0x00800000 )<<8;
2019
    if ( bSig <= ( aSig + aSig ) ) {
2020
        aSig >>= 1;
2021
        ++zExp;
2022
    }
2023
    zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2024
    if ( ( zSig & 0x3F ) == 0 ) {
2025
        zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2026
    }
2027
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2028

    
2029
}
2030

    
2031
/*----------------------------------------------------------------------------
2032
| Returns the remainder of the single-precision floating-point value `a'
2033
| with respect to the corresponding value `b'.  The operation is performed
2034
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2035
*----------------------------------------------------------------------------*/
2036

    
2037
float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2038
{
2039
    flag aSign, zSign;
2040
    int16 aExp, bExp, expDiff;
2041
    uint32_t aSig, bSig;
2042
    uint32_t q;
2043
    uint64_t aSig64, bSig64, q64;
2044
    uint32_t alternateASig;
2045
    int32_t sigMean;
2046
    a = float32_squash_input_denormal(a STATUS_VAR);
2047
    b = float32_squash_input_denormal(b STATUS_VAR);
2048

    
2049
    aSig = extractFloat32Frac( a );
2050
    aExp = extractFloat32Exp( a );
2051
    aSign = extractFloat32Sign( a );
2052
    bSig = extractFloat32Frac( b );
2053
    bExp = extractFloat32Exp( b );
2054
    if ( aExp == 0xFF ) {
2055
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2056
            return propagateFloat32NaN( a, b STATUS_VAR );
2057
        }
2058
        float_raise( float_flag_invalid STATUS_VAR);
2059
        return float32_default_nan;
2060
    }
2061
    if ( bExp == 0xFF ) {
2062
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2063
        return a;
2064
    }
2065
    if ( bExp == 0 ) {
2066
        if ( bSig == 0 ) {
2067
            float_raise( float_flag_invalid STATUS_VAR);
2068
            return float32_default_nan;
2069
        }
2070
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2071
    }
2072
    if ( aExp == 0 ) {
2073
        if ( aSig == 0 ) return a;
2074
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2075
    }
2076
    expDiff = aExp - bExp;
2077
    aSig |= 0x00800000;
2078
    bSig |= 0x00800000;
2079
    if ( expDiff < 32 ) {
2080
        aSig <<= 8;
2081
        bSig <<= 8;
2082
        if ( expDiff < 0 ) {
2083
            if ( expDiff < -1 ) return a;
2084
            aSig >>= 1;
2085
        }
2086
        q = ( bSig <= aSig );
2087
        if ( q ) aSig -= bSig;
2088
        if ( 0 < expDiff ) {
2089
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2090
            q >>= 32 - expDiff;
2091
            bSig >>= 2;
2092
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2093
        }
2094
        else {
2095
            aSig >>= 2;
2096
            bSig >>= 2;
2097
        }
2098
    }
2099
    else {
2100
        if ( bSig <= aSig ) aSig -= bSig;
2101
        aSig64 = ( (uint64_t) aSig )<<40;
2102
        bSig64 = ( (uint64_t) bSig )<<40;
2103
        expDiff -= 64;
2104
        while ( 0 < expDiff ) {
2105
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2106
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2107
            aSig64 = - ( ( bSig * q64 )<<38 );
2108
            expDiff -= 62;
2109
        }
2110
        expDiff += 64;
2111
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2112
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2113
        q = q64>>( 64 - expDiff );
2114
        bSig <<= 6;
2115
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2116
    }
2117
    do {
2118
        alternateASig = aSig;
2119
        ++q;
2120
        aSig -= bSig;
2121
    } while ( 0 <= (int32_t) aSig );
2122
    sigMean = aSig + alternateASig;
2123
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2124
        aSig = alternateASig;
2125
    }
2126
    zSign = ( (int32_t) aSig < 0 );
2127
    if ( zSign ) aSig = - aSig;
2128
    return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2129

    
2130
}
2131

    
2132
/*----------------------------------------------------------------------------
2133
| Returns the square root of the single-precision floating-point value `a'.
2134
| The operation is performed according to the IEC/IEEE Standard for Binary
2135
| Floating-Point Arithmetic.
2136
*----------------------------------------------------------------------------*/
2137

    
2138
float32 float32_sqrt( float32 a STATUS_PARAM )
2139
{
2140
    flag aSign;
2141
    int16 aExp, zExp;
2142
    uint32_t aSig, zSig;
2143
    uint64_t rem, term;
2144
    a = float32_squash_input_denormal(a STATUS_VAR);
2145

    
2146
    aSig = extractFloat32Frac( a );
2147
    aExp = extractFloat32Exp( a );
2148
    aSign = extractFloat32Sign( a );
2149
    if ( aExp == 0xFF ) {
2150
        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2151
        if ( ! aSign ) return a;
2152
        float_raise( float_flag_invalid STATUS_VAR);
2153
        return float32_default_nan;
2154
    }
2155
    if ( aSign ) {
2156
        if ( ( aExp | aSig ) == 0 ) return a;
2157
        float_raise( float_flag_invalid STATUS_VAR);
2158
        return float32_default_nan;
2159
    }
2160
    if ( aExp == 0 ) {
2161
        if ( aSig == 0 ) return float32_zero;
2162
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2163
    }
2164
    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2165
    aSig = ( aSig | 0x00800000 )<<8;
2166
    zSig = estimateSqrt32( aExp, aSig ) + 2;
2167
    if ( ( zSig & 0x7F ) <= 5 ) {
2168
        if ( zSig < 2 ) {
2169
            zSig = 0x7FFFFFFF;
2170
            goto roundAndPack;
2171
        }
2172
        aSig >>= aExp & 1;
2173
        term = ( (uint64_t) zSig ) * zSig;
2174
        rem = ( ( (uint64_t) aSig )<<32 ) - term;
2175
        while ( (int64_t) rem < 0 ) {
2176
            --zSig;
2177
            rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2178
        }
2179
        zSig |= ( rem != 0 );
2180
    }
2181
    shift32RightJamming( zSig, 1, &zSig );
2182
 roundAndPack:
2183
    return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2184

    
2185
}
2186

    
2187
/*----------------------------------------------------------------------------
2188
| Returns the binary exponential of the single-precision floating-point value
2189
| `a'. The operation is performed according to the IEC/IEEE Standard for
2190
| Binary Floating-Point Arithmetic.
2191
|
2192
| Uses the following identities:
2193
|
2194
| 1. -------------------------------------------------------------------------
2195
|      x    x*ln(2)
2196
|     2  = e
2197
|
2198
| 2. -------------------------------------------------------------------------
2199
|                      2     3     4     5           n
2200
|      x        x     x     x     x     x           x
2201
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2202
|               1!    2!    3!    4!    5!          n!
2203
*----------------------------------------------------------------------------*/
2204

    
2205
static const float64 float32_exp2_coefficients[15] =
2206
{
2207
    const_float64( 0x3ff0000000000000ll ), /*  1 */
2208
    const_float64( 0x3fe0000000000000ll ), /*  2 */
2209
    const_float64( 0x3fc5555555555555ll ), /*  3 */
2210
    const_float64( 0x3fa5555555555555ll ), /*  4 */
2211
    const_float64( 0x3f81111111111111ll ), /*  5 */
2212
    const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2213
    const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2214
    const_float64( 0x3efa01a01a01a01all ), /*  8 */
2215
    const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2216
    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2217
    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2218
    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2219
    const_float64( 0x3de6124613a86d09ll ), /* 13 */
2220
    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2221
    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2222
};
2223

    
2224
float32 float32_exp2( float32 a STATUS_PARAM )
2225
{
2226
    flag aSign;
2227
    int16 aExp;
2228
    uint32_t aSig;
2229
    float64 r, x, xn;
2230
    int i;
2231
    a = float32_squash_input_denormal(a STATUS_VAR);
2232

    
2233
    aSig = extractFloat32Frac( a );
2234
    aExp = extractFloat32Exp( a );
2235
    aSign = extractFloat32Sign( a );
2236

    
2237
    if ( aExp == 0xFF) {
2238
        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2239
        return (aSign) ? float32_zero : a;
2240
    }
2241
    if (aExp == 0) {
2242
        if (aSig == 0) return float32_one;
2243
    }
2244

    
2245
    float_raise( float_flag_inexact STATUS_VAR);
2246

    
2247
    /* ******************************* */
2248
    /* using float64 for approximation */
2249
    /* ******************************* */
2250
    x = float32_to_float64(a STATUS_VAR);
2251
    x = float64_mul(x, float64_ln2 STATUS_VAR);
2252

    
2253
    xn = x;
2254
    r = float64_one;
2255
    for (i = 0 ; i < 15 ; i++) {
2256
        float64 f;
2257

    
2258
        f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2259
        r = float64_add(r, f STATUS_VAR);
2260

    
2261
        xn = float64_mul(xn, x STATUS_VAR);
2262
    }
2263

    
2264
    return float64_to_float32(r, status);
2265
}
2266

    
2267
/*----------------------------------------------------------------------------
2268
| Returns the binary log of the single-precision floating-point value `a'.
2269
| The operation is performed according to the IEC/IEEE Standard for Binary
2270
| Floating-Point Arithmetic.
2271
*----------------------------------------------------------------------------*/
2272
float32 float32_log2( float32 a STATUS_PARAM )
2273
{
2274
    flag aSign, zSign;
2275
    int16 aExp;
2276
    uint32_t aSig, zSig, i;
2277

    
2278
    a = float32_squash_input_denormal(a STATUS_VAR);
2279
    aSig = extractFloat32Frac( a );
2280
    aExp = extractFloat32Exp( a );
2281
    aSign = extractFloat32Sign( a );
2282

    
2283
    if ( aExp == 0 ) {
2284
        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2285
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2286
    }
2287
    if ( aSign ) {
2288
        float_raise( float_flag_invalid STATUS_VAR);
2289
        return float32_default_nan;
2290
    }
2291
    if ( aExp == 0xFF ) {
2292
        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2293
        return a;
2294
    }
2295

    
2296
    aExp -= 0x7F;
2297
    aSig |= 0x00800000;
2298
    zSign = aExp < 0;
2299
    zSig = aExp << 23;
2300

    
2301
    for (i = 1 << 22; i > 0; i >>= 1) {
2302
        aSig = ( (uint64_t)aSig * aSig ) >> 23;
2303
        if ( aSig & 0x01000000 ) {
2304
            aSig >>= 1;
2305
            zSig |= i;
2306
        }
2307
    }
2308

    
2309
    if ( zSign )
2310
        zSig = -zSig;
2311

    
2312
    return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2313
}
2314

    
2315
/*----------------------------------------------------------------------------
2316
| Returns 1 if the single-precision floating-point value `a' is equal to
2317
| the corresponding value `b', and 0 otherwise.  The comparison is performed
2318
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2319
*----------------------------------------------------------------------------*/
2320

    
2321
int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2322
{
2323
    a = float32_squash_input_denormal(a STATUS_VAR);
2324
    b = float32_squash_input_denormal(b STATUS_VAR);
2325

    
2326
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2327
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2328
       ) {
2329
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2330
            float_raise( float_flag_invalid STATUS_VAR);
2331
        }
2332
        return 0;
2333
    }
2334
    return ( float32_val(a) == float32_val(b) ) ||
2335
            ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2336

    
2337
}
2338

    
2339
/*----------------------------------------------------------------------------
2340
| Returns 1 if the single-precision floating-point value `a' is less than
2341
| or equal to the corresponding value `b', and 0 otherwise.  The comparison
2342
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2343
| Arithmetic.
2344
*----------------------------------------------------------------------------*/
2345

    
2346
int float32_le( float32 a, float32 b STATUS_PARAM )
2347
{
2348
    flag aSign, bSign;
2349
    uint32_t av, bv;
2350
    a = float32_squash_input_denormal(a STATUS_VAR);
2351
    b = float32_squash_input_denormal(b STATUS_VAR);
2352

    
2353
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2354
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2355
       ) {
2356
        float_raise( float_flag_invalid STATUS_VAR);
2357
        return 0;
2358
    }
2359
    aSign = extractFloat32Sign( a );
2360
    bSign = extractFloat32Sign( b );
2361
    av = float32_val(a);
2362
    bv = float32_val(b);
2363
    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2364
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2365

    
2366
}
2367

    
2368
/*----------------------------------------------------------------------------
2369
| Returns 1 if the single-precision floating-point value `a' is less than
2370
| the corresponding value `b', and 0 otherwise.  The comparison is performed
2371
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2372
*----------------------------------------------------------------------------*/
2373

    
2374
int float32_lt( float32 a, float32 b STATUS_PARAM )
2375
{
2376
    flag aSign, bSign;
2377
    uint32_t av, bv;
2378
    a = float32_squash_input_denormal(a STATUS_VAR);
2379
    b = float32_squash_input_denormal(b STATUS_VAR);
2380

    
2381
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2382
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2383
       ) {
2384
        float_raise( float_flag_invalid STATUS_VAR);
2385
        return 0;
2386
    }
2387
    aSign = extractFloat32Sign( a );
2388
    bSign = extractFloat32Sign( b );
2389
    av = float32_val(a);
2390
    bv = float32_val(b);
2391
    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2392
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2393

    
2394
}
2395

    
2396
/*----------------------------------------------------------------------------
2397
| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2398
| be compared, and 0 otherwise.  The comparison is performed according to the
2399
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2400
*----------------------------------------------------------------------------*/
2401

    
2402
int float32_unordered( float32 a, float32 b STATUS_PARAM )
2403
{
2404
    a = float32_squash_input_denormal(a STATUS_VAR);
2405
    b = float32_squash_input_denormal(b STATUS_VAR);
2406

    
2407
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2408
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2409
       ) {
2410
        float_raise( float_flag_invalid STATUS_VAR);
2411
        return 1;
2412
    }
2413
    return 0;
2414
}
2415
/*----------------------------------------------------------------------------
2416
| Returns 1 if the single-precision floating-point value `a' is equal to
2417
| the corresponding value `b', and 0 otherwise.  The invalid exception is
2418
| raised if either operand is a NaN.  Otherwise, the comparison is performed
2419
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2420
*----------------------------------------------------------------------------*/
2421

    
2422
int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2423
{
2424
    uint32_t av, bv;
2425
    a = float32_squash_input_denormal(a STATUS_VAR);
2426
    b = float32_squash_input_denormal(b STATUS_VAR);
2427

    
2428
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2429
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2430
       ) {
2431
        float_raise( float_flag_invalid STATUS_VAR);
2432
        return 0;
2433
    }
2434
    av = float32_val(a);
2435
    bv = float32_val(b);
2436
    return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2437

    
2438
}
2439

    
2440
/*----------------------------------------------------------------------------
2441
| Returns 1 if the single-precision floating-point value `a' is less than or
2442
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2443
| cause an exception.  Otherwise, the comparison is performed according to the
2444
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2445
*----------------------------------------------------------------------------*/
2446

    
2447
int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2448
{
2449
    flag aSign, bSign;
2450
    uint32_t av, bv;
2451
    a = float32_squash_input_denormal(a STATUS_VAR);
2452
    b = float32_squash_input_denormal(b STATUS_VAR);
2453

    
2454
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2455
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2456
       ) {
2457
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2458
            float_raise( float_flag_invalid STATUS_VAR);
2459
        }
2460
        return 0;
2461
    }
2462
    aSign = extractFloat32Sign( a );
2463
    bSign = extractFloat32Sign( b );
2464
    av = float32_val(a);
2465
    bv = float32_val(b);
2466
    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2467
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2468

    
2469
}
2470

    
2471
/*----------------------------------------------------------------------------
2472
| Returns 1 if the single-precision floating-point value `a' is less than
2473
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2474
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2475
| Standard for Binary Floating-Point Arithmetic.
2476
*----------------------------------------------------------------------------*/
2477

    
2478
int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2479
{
2480
    flag aSign, bSign;
2481
    uint32_t av, bv;
2482
    a = float32_squash_input_denormal(a STATUS_VAR);
2483
    b = float32_squash_input_denormal(b STATUS_VAR);
2484

    
2485
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2486
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2487
       ) {
2488
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2489
            float_raise( float_flag_invalid STATUS_VAR);
2490
        }
2491
        return 0;
2492
    }
2493
    aSign = extractFloat32Sign( a );
2494
    bSign = extractFloat32Sign( b );
2495
    av = float32_val(a);
2496
    bv = float32_val(b);
2497
    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2498
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2499

    
2500
}
2501

    
2502
/*----------------------------------------------------------------------------
2503
| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2504
| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
2505
| comparison is performed according to the IEC/IEEE Standard for Binary
2506
| Floating-Point Arithmetic.
2507
*----------------------------------------------------------------------------*/
2508

    
2509
int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2510
{
2511
    a = float32_squash_input_denormal(a STATUS_VAR);
2512
    b = float32_squash_input_denormal(b STATUS_VAR);
2513

    
2514
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2515
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2516
       ) {
2517
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2518
            float_raise( float_flag_invalid STATUS_VAR);
2519
        }
2520
        return 1;
2521
    }
2522
    return 0;
2523
}
2524

    
2525
/*----------------------------------------------------------------------------
2526
| Returns the result of converting the double-precision floating-point value
2527
| `a' to the 32-bit two's complement integer format.  The conversion is
2528
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2529
| Arithmetic---which means in particular that the conversion is rounded
2530
| according to the current rounding mode.  If `a' is a NaN, the largest
2531
| positive integer is returned.  Otherwise, if the conversion overflows, the
2532
| largest integer with the same sign as `a' is returned.
2533
*----------------------------------------------------------------------------*/
2534

    
2535
int32 float64_to_int32( float64 a STATUS_PARAM )
2536
{
2537
    flag aSign;
2538
    int16 aExp, shiftCount;
2539
    uint64_t aSig;
2540
    a = float64_squash_input_denormal(a STATUS_VAR);
2541

    
2542
    aSig = extractFloat64Frac( a );
2543
    aExp = extractFloat64Exp( a );
2544
    aSign = extractFloat64Sign( a );
2545
    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2546
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2547
    shiftCount = 0x42C - aExp;
2548
    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2549
    return roundAndPackInt32( aSign, aSig STATUS_VAR );
2550

    
2551
}
2552

    
2553
/*----------------------------------------------------------------------------
2554
| Returns the result of converting the double-precision floating-point value
2555
| `a' to the 32-bit two's complement integer format.  The conversion is
2556
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2557
| Arithmetic, except that the conversion is always rounded toward zero.
2558
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2559
| the conversion overflows, the largest integer with the same sign as `a' is
2560
| returned.
2561
*----------------------------------------------------------------------------*/
2562

    
2563
int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2564
{
2565
    flag aSign;
2566
    int16 aExp, shiftCount;
2567
    uint64_t aSig, savedASig;
2568
    int32 z;
2569
    a = float64_squash_input_denormal(a STATUS_VAR);
2570

    
2571
    aSig = extractFloat64Frac( a );
2572
    aExp = extractFloat64Exp( a );
2573
    aSign = extractFloat64Sign( a );
2574
    if ( 0x41E < aExp ) {
2575
        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2576
        goto invalid;
2577
    }
2578
    else if ( aExp < 0x3FF ) {
2579
        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2580
        return 0;
2581
    }
2582
    aSig |= LIT64( 0x0010000000000000 );
2583
    shiftCount = 0x433 - aExp;
2584
    savedASig = aSig;
2585
    aSig >>= shiftCount;
2586
    z = aSig;
2587
    if ( aSign ) z = - z;
2588
    if ( ( z < 0 ) ^ aSign ) {
2589
 invalid:
2590
        float_raise( float_flag_invalid STATUS_VAR);
2591
        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2592
    }
2593
    if ( ( aSig<<shiftCount ) != savedASig ) {
2594
        STATUS(float_exception_flags) |= float_flag_inexact;
2595
    }
2596
    return z;
2597

    
2598
}
2599

    
2600
/*----------------------------------------------------------------------------
2601
| Returns the result of converting the double-precision floating-point value
2602
| `a' to the 16-bit two's complement integer format.  The conversion is
2603
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2604
| Arithmetic, except that the conversion is always rounded toward zero.
2605
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2606
| the conversion overflows, the largest integer with the same sign as `a' is
2607
| returned.
2608
*----------------------------------------------------------------------------*/
2609

    
2610
int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2611
{
2612
    flag aSign;
2613
    int16 aExp, shiftCount;
2614
    uint64_t aSig, savedASig;
2615
    int32 z;
2616

    
2617
    aSig = extractFloat64Frac( a );
2618
    aExp = extractFloat64Exp( a );
2619
    aSign = extractFloat64Sign( a );
2620
    if ( 0x40E < aExp ) {
2621
        if ( ( aExp == 0x7FF ) && aSig ) {
2622
            aSign = 0;
2623
        }
2624
        goto invalid;
2625
    }
2626
    else if ( aExp < 0x3FF ) {
2627
        if ( aExp || aSig ) {
2628
            STATUS(float_exception_flags) |= float_flag_inexact;
2629
        }
2630
        return 0;
2631
    }
2632
    aSig |= LIT64( 0x0010000000000000 );
2633
    shiftCount = 0x433 - aExp;
2634
    savedASig = aSig;
2635
    aSig >>= shiftCount;
2636
    z = aSig;
2637
    if ( aSign ) {
2638
        z = - z;
2639
    }
2640
    if ( ( (int16_t)z < 0 ) ^ aSign ) {
2641
 invalid:
2642
        float_raise( float_flag_invalid STATUS_VAR);
2643
        return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2644
    }
2645
    if ( ( aSig<<shiftCount ) != savedASig ) {
2646
        STATUS(float_exception_flags) |= float_flag_inexact;
2647
    }
2648
    return z;
2649
}
2650

    
2651
/*----------------------------------------------------------------------------
2652
| Returns the result of converting the double-precision floating-point value
2653
| `a' to the 64-bit two's complement integer format.  The conversion is
2654
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2655
| Arithmetic---which means in particular that the conversion is rounded
2656
| according to the current rounding mode.  If `a' is a NaN, the largest
2657
| positive integer is returned.  Otherwise, if the conversion overflows, the
2658
| largest integer with the same sign as `a' is returned.
2659
*----------------------------------------------------------------------------*/
2660

    
2661
int64 float64_to_int64( float64 a STATUS_PARAM )
2662
{
2663
    flag aSign;
2664
    int16 aExp, shiftCount;
2665
    uint64_t aSig, aSigExtra;
2666
    a = float64_squash_input_denormal(a STATUS_VAR);
2667

    
2668
    aSig = extractFloat64Frac( a );
2669
    aExp = extractFloat64Exp( a );
2670
    aSign = extractFloat64Sign( a );
2671
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2672
    shiftCount = 0x433 - aExp;
2673
    if ( shiftCount <= 0 ) {
2674
        if ( 0x43E < aExp ) {
2675
            float_raise( float_flag_invalid STATUS_VAR);
2676
            if (    ! aSign
2677
                 || (    ( aExp == 0x7FF )
2678
                      && ( aSig != LIT64( 0x0010000000000000 ) ) )
2679
               ) {
2680
                return LIT64( 0x7FFFFFFFFFFFFFFF );
2681
            }
2682
            return (int64_t) LIT64( 0x8000000000000000 );
2683
        }
2684
        aSigExtra = 0;
2685
        aSig <<= - shiftCount;
2686
    }
2687
    else {
2688
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2689
    }
2690
    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2691

    
2692
}
2693

    
2694
/*----------------------------------------------------------------------------
2695
| Returns the result of converting the double-precision floating-point value
2696
| `a' to the 64-bit two's complement integer format.  The conversion is
2697
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2698
| Arithmetic, except that the conversion is always rounded toward zero.
2699
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2700
| the conversion overflows, the largest integer with the same sign as `a' is
2701
| returned.
2702
*----------------------------------------------------------------------------*/
2703

    
2704
int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2705
{
2706
    flag aSign;
2707
    int16 aExp, shiftCount;
2708
    uint64_t aSig;
2709
    int64 z;
2710
    a = float64_squash_input_denormal(a STATUS_VAR);
2711

    
2712
    aSig = extractFloat64Frac( a );
2713
    aExp = extractFloat64Exp( a );
2714
    aSign = extractFloat64Sign( a );
2715
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2716
    shiftCount = aExp - 0x433;
2717
    if ( 0 <= shiftCount ) {
2718
        if ( 0x43E <= aExp ) {
2719
            if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2720
                float_raise( float_flag_invalid STATUS_VAR);
2721
                if (    ! aSign
2722
                     || (    ( aExp == 0x7FF )
2723
                          && ( aSig != LIT64( 0x0010000000000000 ) ) )
2724
                   ) {
2725
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
2726
                }
2727
            }
2728
            return (int64_t) LIT64( 0x8000000000000000 );
2729
        }
2730
        z = aSig<<shiftCount;
2731
    }
2732
    else {
2733
        if ( aExp < 0x3FE ) {
2734
            if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2735
            return 0;
2736
        }
2737
        z = aSig>>( - shiftCount );
2738
        if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
2739
            STATUS(float_exception_flags) |= float_flag_inexact;
2740
        }
2741
    }
2742
    if ( aSign ) z = - z;
2743
    return z;
2744

    
2745
}
2746

    
2747
/*----------------------------------------------------------------------------
2748
| Returns the result of converting the double-precision floating-point value
2749
| `a' to the single-precision floating-point format.  The conversion is
2750
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2751
| Arithmetic.
2752
*----------------------------------------------------------------------------*/
2753

    
2754
float32 float64_to_float32( float64 a STATUS_PARAM )
2755
{
2756
    flag aSign;
2757
    int16 aExp;
2758
    uint64_t aSig;
2759
    uint32_t zSig;
2760
    a = float64_squash_input_denormal(a STATUS_VAR);
2761

    
2762
    aSig = extractFloat64Frac( a );
2763
    aExp = extractFloat64Exp( a );
2764
    aSign = extractFloat64Sign( a );
2765
    if ( aExp == 0x7FF ) {
2766
        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2767
        return packFloat32( aSign, 0xFF, 0 );
2768
    }
2769
    shift64RightJamming( aSig, 22, &aSig );
2770
    zSig = aSig;
2771
    if ( aExp || zSig ) {
2772
        zSig |= 0x40000000;
2773
        aExp -= 0x381;
2774
    }
2775
    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2776

    
2777
}
2778

    
2779

    
2780
/*----------------------------------------------------------------------------
2781
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2782
| half-precision floating-point value, returning the result.  After being
2783
| shifted into the proper positions, the three fields are simply added
2784
| together to form the result.  This means that any integer portion of `zSig'
2785
| will be added into the exponent.  Since a properly normalized significand
2786
| will have an integer portion equal to 1, the `zExp' input should be 1 less
2787
| than the desired result exponent whenever `zSig' is a complete, normalized
2788
| significand.
2789
*----------------------------------------------------------------------------*/
2790
static float16 packFloat16(flag zSign, int16 zExp, uint16_t zSig)
2791
{
2792
    return make_float16(
2793
        (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
2794
}
2795

    
2796
/* Half precision floats come in two formats: standard IEEE and "ARM" format.
2797
   The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
2798

    
2799
float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
2800
{
2801
    flag aSign;
2802
    int16 aExp;
2803
    uint32_t aSig;
2804

    
2805
    aSign = extractFloat16Sign(a);
2806
    aExp = extractFloat16Exp(a);
2807
    aSig = extractFloat16Frac(a);
2808

    
2809
    if (aExp == 0x1f && ieee) {
2810
        if (aSig) {
2811
            return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
2812
        }
2813
        return packFloat32(aSign, 0xff, aSig << 13);
2814
    }
2815
    if (aExp == 0) {
2816
        int8 shiftCount;
2817

    
2818
        if (aSig == 0) {
2819
            return packFloat32(aSign, 0, 0);
2820
        }
2821

    
2822
        shiftCount = countLeadingZeros32( aSig ) - 21;
2823
        aSig = aSig << shiftCount;
2824
        aExp = -shiftCount;
2825
    }
2826
    return packFloat32( aSign, aExp + 0x70, aSig << 13);
2827
}
2828

    
2829
float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
2830
{
2831
    flag aSign;
2832
    int16 aExp;
2833
    uint32_t aSig;
2834
    uint32_t mask;
2835
    uint32_t increment;
2836
    int8 roundingMode;
2837
    a = float32_squash_input_denormal(a STATUS_VAR);
2838

    
2839
    aSig = extractFloat32Frac( a );
2840
    aExp = extractFloat32Exp( a );
2841
    aSign = extractFloat32Sign( a );
2842
    if ( aExp == 0xFF ) {
2843
        if (aSig) {
2844
            /* Input is a NaN */
2845
            float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2846
            if (!ieee) {
2847
                return packFloat16(aSign, 0, 0);
2848
            }
2849
            return r;
2850
        }
2851
        /* Infinity */
2852
        if (!ieee) {
2853
            float_raise(float_flag_invalid STATUS_VAR);
2854
            return packFloat16(aSign, 0x1f, 0x3ff);
2855
        }
2856
        return packFloat16(aSign, 0x1f, 0);
2857
    }
2858
    if (aExp == 0 && aSig == 0) {
2859
        return packFloat16(aSign, 0, 0);
2860
    }
2861
    /* Decimal point between bits 22 and 23.  */
2862
    aSig |= 0x00800000;
2863
    aExp -= 0x7f;
2864
    if (aExp < -14) {
2865
        mask = 0x00ffffff;
2866
        if (aExp >= -24) {
2867
            mask >>= 25 + aExp;
2868
        }
2869
    } else {
2870
        mask = 0x00001fff;
2871
    }
2872
    if (aSig & mask) {
2873
        float_raise( float_flag_underflow STATUS_VAR );
2874
        roundingMode = STATUS(float_rounding_mode);
2875
        switch (roundingMode) {
2876
        case float_round_nearest_even:
2877
            increment = (mask + 1) >> 1;
2878
            if ((aSig & mask) == increment) {
2879
                increment = aSig & (increment << 1);
2880
            }
2881
            break;
2882
        case float_round_up:
2883
            increment = aSign ? 0 : mask;
2884
            break;
2885
        case float_round_down:
2886
            increment = aSign ? mask : 0;
2887
            break;
2888
        default: /* round_to_zero */
2889
            increment = 0;
2890
            break;
2891
        }
2892
        aSig += increment;
2893
        if (aSig >= 0x01000000) {
2894
            aSig >>= 1;
2895
            aExp++;
2896
        }
2897
    } else if (aExp < -14
2898
          && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2899
        float_raise( float_flag_underflow STATUS_VAR);
2900
    }
2901

    
2902
    if (ieee) {
2903
        if (aExp > 15) {
2904
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2905
            return packFloat16(aSign, 0x1f, 0);
2906
        }
2907
    } else {
2908
        if (aExp > 16) {
2909
            float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
2910
            return packFloat16(aSign, 0x1f, 0x3ff);
2911
        }
2912
    }
2913
    if (aExp < -24) {
2914
        return packFloat16(aSign, 0, 0);
2915
    }
2916
    if (aExp < -14) {
2917
        aSig >>= -14 - aExp;
2918
        aExp = -14;
2919
    }
2920
    return packFloat16(aSign, aExp + 14, aSig >> 13);
2921
}
2922

    
2923
#ifdef FLOATX80
2924

    
2925
/*----------------------------------------------------------------------------
2926
| Returns the result of converting the double-precision floating-point value
2927
| `a' to the extended double-precision floating-point format.  The conversion
2928
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2929
| Arithmetic.
2930
*----------------------------------------------------------------------------*/
2931

    
2932
floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2933
{
2934
    flag aSign;
2935
    int16 aExp;
2936
    uint64_t aSig;
2937

    
2938
    a = float64_squash_input_denormal(a STATUS_VAR);
2939
    aSig = extractFloat64Frac( a );
2940
    aExp = extractFloat64Exp( a );
2941
    aSign = extractFloat64Sign( a );
2942
    if ( aExp == 0x7FF ) {
2943
        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2944
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2945
    }
2946
    if ( aExp == 0 ) {
2947
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2948
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2949
    }
2950
    return
2951
        packFloatx80(
2952
            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2953

    
2954
}
2955

    
2956
#endif
2957

    
2958
#ifdef FLOAT128
2959

    
2960
/*----------------------------------------------------------------------------
2961
| Returns the result of converting the double-precision floating-point value
2962
| `a' to the quadruple-precision floating-point format.  The conversion is
2963
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2964
| Arithmetic.
2965
*----------------------------------------------------------------------------*/
2966

    
2967
float128 float64_to_float128( float64 a STATUS_PARAM )
2968
{
2969
    flag aSign;
2970
    int16 aExp;
2971
    uint64_t aSig, zSig0, zSig1;
2972

    
2973
    a = float64_squash_input_denormal(a STATUS_VAR);
2974
    aSig = extractFloat64Frac( a );
2975
    aExp = extractFloat64Exp( a );
2976
    aSign = extractFloat64Sign( a );
2977
    if ( aExp == 0x7FF ) {
2978
        if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2979
        return packFloat128( aSign, 0x7FFF, 0, 0 );
2980
    }
2981
    if ( aExp == 0 ) {
2982
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2983
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2984
        --aExp;
2985
    }
2986
    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2987
    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2988

    
2989
}
2990

    
2991
#endif
2992

    
2993
/*----------------------------------------------------------------------------
2994
| Rounds the double-precision floating-point value `a' to an integer, and
2995
| returns the result as a double-precision floating-point value.  The
2996
| operation is performed according to the IEC/IEEE Standard for Binary
2997
| Floating-Point Arithmetic.
2998
*----------------------------------------------------------------------------*/
2999

    
3000
float64 float64_round_to_int( float64 a STATUS_PARAM )
3001
{
3002
    flag aSign;
3003
    int16 aExp;
3004
    uint64_t lastBitMask, roundBitsMask;
3005
    int8 roundingMode;
3006
    uint64_t z;
3007
    a = float64_squash_input_denormal(a STATUS_VAR);
3008

    
3009
    aExp = extractFloat64Exp( a );
3010
    if ( 0x433 <= aExp ) {
3011
        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3012
            return propagateFloat64NaN( a, a STATUS_VAR );
3013
        }
3014
        return a;
3015
    }
3016
    if ( aExp < 0x3FF ) {
3017
        if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3018
        STATUS(float_exception_flags) |= float_flag_inexact;
3019
        aSign = extractFloat64Sign( a );
3020
        switch ( STATUS(float_rounding_mode) ) {
3021
         case float_round_nearest_even:
3022
            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3023
                return packFloat64( aSign, 0x3FF, 0 );
3024
            }
3025
            break;
3026
         case float_round_down:
3027
            return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3028
         case float_round_up:
3029
            return make_float64(
3030
            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3031
        }
3032
        return packFloat64( aSign, 0, 0 );
3033
    }
3034
    lastBitMask = 1;
3035
    lastBitMask <<= 0x433 - aExp;
3036
    roundBitsMask = lastBitMask - 1;
3037
    z = float64_val(a);
3038
    roundingMode = STATUS(float_rounding_mode);
3039
    if ( roundingMode == float_round_nearest_even ) {
3040
        z += lastBitMask>>1;
3041
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3042
    }
3043
    else if ( roundingMode != float_round_to_zero ) {
3044
        if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3045
            z += roundBitsMask;
3046
        }
3047
    }
3048
    z &= ~ roundBitsMask;
3049
    if ( z != float64_val(a) )
3050
        STATUS(float_exception_flags) |= float_flag_inexact;
3051
    return make_float64(z);
3052

    
3053
}
3054

    
3055
float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3056
{
3057
    int oldmode;
3058
    float64 res;
3059
    oldmode = STATUS(float_rounding_mode);
3060
    STATUS(float_rounding_mode) = float_round_to_zero;
3061
    res = float64_round_to_int(a STATUS_VAR);
3062
    STATUS(float_rounding_mode) = oldmode;
3063
    return res;
3064
}
3065

    
3066
/*----------------------------------------------------------------------------
3067
| Returns the result of adding the absolute values of the double-precision
3068
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3069
| before being returned.  `zSign' is ignored if the result is a NaN.
3070
| The addition is performed according to the IEC/IEEE Standard for Binary
3071
| Floating-Point Arithmetic.
3072
*----------------------------------------------------------------------------*/
3073

    
3074
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3075
{
3076
    int16 aExp, bExp, zExp;
3077
    uint64_t aSig, bSig, zSig;
3078
    int16 expDiff;
3079

    
3080
    aSig = extractFloat64Frac( a );
3081
    aExp = extractFloat64Exp( a );
3082
    bSig = extractFloat64Frac( b );
3083
    bExp = extractFloat64Exp( b );
3084
    expDiff = aExp - bExp;
3085
    aSig <<= 9;
3086
    bSig <<= 9;
3087
    if ( 0 < expDiff ) {
3088
        if ( aExp == 0x7FF ) {
3089
            if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3090
            return a;
3091
        }
3092
        if ( bExp == 0 ) {
3093
            --expDiff;
3094
        }
3095
        else {
3096
            bSig |= LIT64( 0x2000000000000000 );
3097
        }
3098
        shift64RightJamming( bSig, expDiff, &bSig );
3099
        zExp = aExp;
3100
    }
3101
    else if ( expDiff < 0 ) {
3102
        if ( bExp == 0x7FF ) {
3103
            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3104
            return packFloat64( zSign, 0x7FF, 0 );
3105
        }
3106
        if ( aExp == 0 ) {
3107
            ++expDiff;
3108
        }
3109
        else {
3110
            aSig |= LIT64( 0x2000000000000000 );
3111
        }
3112
        shift64RightJamming( aSig, - expDiff, &aSig );
3113
        zExp = bExp;
3114
    }
3115
    else {
3116
        if ( aExp == 0x7FF ) {
3117
            if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3118
            return a;
3119
        }
3120
        if ( aExp == 0 ) {
3121
            if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
3122
            return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3123
        }
3124
        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3125
        zExp = aExp;
3126
        goto roundAndPack;
3127
    }
3128
    aSig |= LIT64( 0x2000000000000000 );
3129
    zSig = ( aSig + bSig )<<1;
3130
    --zExp;
3131
    if ( (int64_t) zSig < 0 ) {
3132
        zSig = aSig + bSig;
3133
        ++zExp;
3134
    }
3135
 roundAndPack:
3136
    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3137

    
3138
}
3139

    
3140
/*----------------------------------------------------------------------------
3141
| Returns the result of subtracting the absolute values of the double-
3142
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
3143
| difference is negated before being returned.  `zSign' is ignored if the
3144
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
3145
| Standard for Binary Floating-Point Arithmetic.
3146
*----------------------------------------------------------------------------*/
3147

    
3148
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3149
{
3150
    int16 aExp, bExp, zExp;
3151
    uint64_t aSig, bSig, zSig;
3152
    int16 expDiff;
3153

    
3154
    aSig = extractFloat64Frac( a );
3155
    aExp = extractFloat64Exp( a );
3156
    bSig = extractFloat64Frac( b );
3157
    bExp = extractFloat64Exp( b );
3158
    expDiff = aExp - bExp;
3159
    aSig <<= 10;
3160
    bSig <<= 10;
3161
    if ( 0 < expDiff ) goto aExpBigger;
3162
    if ( expDiff < 0 ) goto bExpBigger;
3163
    if ( aExp == 0x7FF ) {
3164
        if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3165
        float_raise( float_flag_invalid STATUS_VAR);
3166
        return float64_default_nan;
3167
    }
3168
    if ( aExp == 0 ) {
3169
        aExp = 1;
3170
        bExp = 1;
3171
    }
3172
    if ( bSig < aSig ) goto aBigger;
3173
    if ( aSig < bSig ) goto bBigger;
3174
    return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3175
 bExpBigger:
3176
    if ( bExp == 0x7FF ) {
3177
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3178
        return packFloat64( zSign ^ 1, 0x7FF, 0 );
3179
    }
3180
    if ( aExp == 0 ) {
3181
        ++expDiff;
3182
    }
3183
    else {
3184
        aSig |= LIT64( 0x4000000000000000 );
3185
    }
3186
    shift64RightJamming( aSig, - expDiff, &aSig );
3187
    bSig |= LIT64( 0x4000000000000000 );
3188
 bBigger:
3189
    zSig = bSig - aSig;
3190
    zExp = bExp;
3191
    zSign ^= 1;
3192
    goto normalizeRoundAndPack;
3193
 aExpBigger:
3194
    if ( aExp == 0x7FF ) {
3195
        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3196
        return a;
3197
    }
3198
    if ( bExp == 0 ) {
3199
        --expDiff;
3200
    }
3201
    else {
3202
        bSig |= LIT64( 0x4000000000000000 );
3203
    }
3204
    shift64RightJamming( bSig, expDiff, &bSig );
3205
    aSig |= LIT64( 0x4000000000000000 );
3206
 aBigger:
3207
    zSig = aSig - bSig;
3208
    zExp = aExp;
3209
 normalizeRoundAndPack:
3210
    --zExp;
3211
    return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3212

    
3213
}
3214

    
3215
/*----------------------------------------------------------------------------
3216
| Returns the result of adding the double-precision floating-point values `a'
3217
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
3218
| Binary Floating-Point Arithmetic.
3219
*----------------------------------------------------------------------------*/
3220

    
3221
float64 float64_add( float64 a, float64 b STATUS_PARAM )
3222
{
3223
    flag aSign, bSign;
3224
    a = float64_squash_input_denormal(a STATUS_VAR);
3225
    b = float64_squash_input_denormal(b STATUS_VAR);
3226

    
3227
    aSign = extractFloat64Sign( a );
3228
    bSign = extractFloat64Sign( b );
3229
    if ( aSign == bSign ) {
3230
        return addFloat64Sigs( a, b, aSign STATUS_VAR );
3231
    }
3232
    else {
3233
        return subFloat64Sigs( a, b, aSign STATUS_VAR );
3234
    }
3235

    
3236
}
3237

    
3238
/*----------------------------------------------------------------------------
3239
| Returns the result of subtracting the double-precision floating-point values
3240
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3241
| for Binary Floating-Point Arithmetic.
3242
*----------------------------------------------------------------------------*/
3243

    
3244
float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3245
{
3246
    flag aSign, bSign;
3247
    a = float64_squash_input_denormal(a STATUS_VAR);
3248
    b = float64_squash_input_denormal(b STATUS_VAR);
3249

    
3250
    aSign = extractFloat64Sign( a );
3251
    bSign = extractFloat64Sign( b );
3252
    if ( aSign == bSign ) {
3253
        return subFloat64Sigs( a, b, aSign STATUS_VAR );
3254
    }
3255
    else {
3256
        return addFloat64Sigs( a, b, aSign STATUS_VAR );
3257
    }
3258

    
3259
}
3260

    
3261
/*----------------------------------------------------------------------------
3262
| Returns the result of multiplying the double-precision floating-point values
3263
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3264
| for Binary Floating-Point Arithmetic.
3265
*----------------------------------------------------------------------------*/
3266

    
3267
float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3268
{
3269
    flag aSign, bSign, zSign;
3270
    int16 aExp, bExp, zExp;
3271
    uint64_t aSig, bSig, zSig0, zSig1;
3272

    
3273
    a = float64_squash_input_denormal(a STATUS_VAR);
3274
    b = float64_squash_input_denormal(b STATUS_VAR);
3275

    
3276
    aSig = extractFloat64Frac( a );
3277
    aExp = extractFloat64Exp( a );
3278
    aSign = extractFloat64Sign( a );
3279
    bSig = extractFloat64Frac( b );
3280
    bExp = extractFloat64Exp( b );
3281
    bSign = extractFloat64Sign( b );
3282
    zSign = aSign ^ bSign;
3283
    if ( aExp == 0x7FF ) {
3284
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3285
            return propagateFloat64NaN( a, b STATUS_VAR );
3286
        }
3287
        if ( ( bExp | bSig ) == 0 ) {
3288
            float_raise( float_flag_invalid STATUS_VAR);
3289
            return float64_default_nan;
3290
        }
3291
        return packFloat64( zSign, 0x7FF, 0 );
3292
    }
3293
    if ( bExp == 0x7FF ) {
3294
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3295
        if ( ( aExp | aSig ) == 0 ) {
3296
            float_raise( float_flag_invalid STATUS_VAR);
3297
            return float64_default_nan;
3298
        }
3299
        return packFloat64( zSign, 0x7FF, 0 );
3300
    }
3301
    if ( aExp == 0 ) {
3302
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3303
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3304
    }
3305
    if ( bExp == 0 ) {
3306
        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3307
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3308
    }
3309
    zExp = aExp + bExp - 0x3FF;
3310
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3311
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3312
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
3313
    zSig0 |= ( zSig1 != 0 );
3314
    if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3315
        zSig0 <<= 1;
3316
        --zExp;
3317
    }
3318
    return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3319

    
3320
}
3321

    
3322
/*----------------------------------------------------------------------------
3323
| Returns the result of dividing the double-precision floating-point value `a'
3324
| by the corresponding value `b'.  The operation is performed according to
3325
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3326
*----------------------------------------------------------------------------*/
3327

    
3328
float64 float64_div( float64 a, float64 b STATUS_PARAM )
3329
{
3330
    flag aSign, bSign, zSign;
3331
    int16 aExp, bExp, zExp;
3332
    uint64_t aSig, bSig, zSig;
3333
    uint64_t rem0, rem1;
3334
    uint64_t term0, term1;
3335
    a = float64_squash_input_denormal(a STATUS_VAR);
3336
    b = float64_squash_input_denormal(b STATUS_VAR);
3337

    
3338
    aSig = extractFloat64Frac( a );
3339
    aExp = extractFloat64Exp( a );
3340
    aSign = extractFloat64Sign( a );
3341
    bSig = extractFloat64Frac( b );
3342
    bExp = extractFloat64Exp( b );
3343
    bSign = extractFloat64Sign( b );
3344
    zSign = aSign ^ bSign;
3345
    if ( aExp == 0x7FF ) {
3346
        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3347
        if ( bExp == 0x7FF ) {
3348
            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3349
            float_raise( float_flag_invalid STATUS_VAR);
3350
            return float64_default_nan;
3351
        }
3352
        return packFloat64( zSign, 0x7FF, 0 );
3353
    }
3354
    if ( bExp == 0x7FF ) {
3355
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3356
        return packFloat64( zSign, 0, 0 );
3357
    }
3358
    if ( bExp == 0 ) {
3359
        if ( bSig == 0 ) {
3360
            if ( ( aExp | aSig ) == 0 ) {
3361
                float_raise( float_flag_invalid STATUS_VAR);
3362
                return float64_default_nan;
3363
            }
3364
            float_raise( float_flag_divbyzero STATUS_VAR);
3365
            return packFloat64( zSign, 0x7FF, 0 );
3366
        }
3367
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3368
    }
3369
    if ( aExp == 0 ) {
3370
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3371
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3372
    }
3373
    zExp = aExp - bExp + 0x3FD;
3374
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3375
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3376
    if ( bSig <= ( aSig + aSig ) ) {
3377
        aSig >>= 1;
3378
        ++zExp;
3379
    }
3380
    zSig = estimateDiv128To64( aSig, 0, bSig );
3381
    if ( ( zSig & 0x1FF ) <= 2 ) {
3382
        mul64To128( bSig, zSig, &term0, &term1 );
3383
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3384
        while ( (int64_t) rem0 < 0 ) {
3385
            --zSig;
3386
            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3387
        }
3388
        zSig |= ( rem1 != 0 );
3389
    }
3390
    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3391

    
3392
}
3393

    
3394
/*----------------------------------------------------------------------------
3395
| Returns the remainder of the double-precision floating-point value `a'
3396
| with respect to the corresponding value `b'.  The operation is performed
3397
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3398
*----------------------------------------------------------------------------*/
3399

    
3400
float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3401
{
3402
    flag aSign, zSign;
3403
    int16 aExp, bExp, expDiff;
3404
    uint64_t aSig, bSig;
3405
    uint64_t q, alternateASig;
3406
    int64_t sigMean;
3407

    
3408
    a = float64_squash_input_denormal(a STATUS_VAR);
3409
    b = float64_squash_input_denormal(b STATUS_VAR);
3410
    aSig = extractFloat64Frac( a );
3411
    aExp = extractFloat64Exp( a );
3412
    aSign = extractFloat64Sign( a );
3413
    bSig = extractFloat64Frac( b );
3414
    bExp = extractFloat64Exp( b );
3415
    if ( aExp == 0x7FF ) {
3416
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3417
            return propagateFloat64NaN( a, b STATUS_VAR );
3418
        }
3419
        float_raise( float_flag_invalid STATUS_VAR);
3420
        return float64_default_nan;
3421
    }
3422
    if ( bExp == 0x7FF ) {
3423
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3424
        return a;
3425
    }
3426
    if ( bExp == 0 ) {
3427
        if ( bSig == 0 ) {
3428
            float_raise( float_flag_invalid STATUS_VAR);
3429
            return float64_default_nan;
3430
        }
3431
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3432
    }
3433
    if ( aExp == 0 ) {
3434
        if ( aSig == 0 ) return a;
3435
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3436
    }
3437
    expDiff = aExp - bExp;
3438
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3439
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3440
    if ( expDiff < 0 ) {
3441
        if ( expDiff < -1 ) return a;
3442
        aSig >>= 1;
3443
    }
3444
    q = ( bSig <= aSig );
3445
    if ( q ) aSig -= bSig;
3446
    expDiff -= 64;
3447
    while ( 0 < expDiff ) {
3448
        q = estimateDiv128To64( aSig, 0, bSig );
3449
        q = ( 2 < q ) ? q - 2 : 0;
3450
        aSig = - ( ( bSig>>2 ) * q );
3451
        expDiff -= 62;
3452
    }
3453
    expDiff += 64;
3454
    if ( 0 < expDiff ) {
3455
        q = estimateDiv128To64( aSig, 0, bSig );
3456
        q = ( 2 < q ) ? q - 2 : 0;
3457
        q >>= 64 - expDiff;
3458
        bSig >>= 2;
3459
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3460
    }
3461
    else {
3462
        aSig >>= 2;
3463
        bSig >>= 2;
3464
    }
3465
    do {
3466
        alternateASig = aSig;
3467
        ++q;
3468
        aSig -= bSig;
3469
    } while ( 0 <= (int64_t) aSig );
3470
    sigMean = aSig + alternateASig;
3471
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3472
        aSig = alternateASig;
3473
    }
3474
    zSign = ( (int64_t) aSig < 0 );
3475
    if ( zSign ) aSig = - aSig;
3476
    return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3477

    
3478
}
3479

    
3480
/*----------------------------------------------------------------------------
3481
| Returns the square root of the double-precision floating-point value `a'.
3482
| The operation is performed according to the IEC/IEEE Standard for Binary
3483
| Floating-Point Arithmetic.
3484
*----------------------------------------------------------------------------*/
3485

    
3486
float64 float64_sqrt( float64 a STATUS_PARAM )
3487
{
3488
    flag aSign;
3489
    int16 aExp, zExp;
3490
    uint64_t aSig, zSig, doubleZSig;
3491
    uint64_t rem0, rem1, term0, term1;
3492
    a = float64_squash_input_denormal(a STATUS_VAR);
3493

    
3494
    aSig = extractFloat64Frac( a );
3495
    aExp = extractFloat64Exp( a );
3496
    aSign = extractFloat64Sign( a );
3497
    if ( aExp == 0x7FF ) {
3498
        if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3499
        if ( ! aSign ) return a;
3500
        float_raise( float_flag_invalid STATUS_VAR);
3501
        return float64_default_nan;
3502
    }
3503
    if ( aSign ) {
3504
        if ( ( aExp | aSig ) == 0 ) return a;
3505
        float_raise( float_flag_invalid STATUS_VAR);
3506
        return float64_default_nan;
3507
    }
3508
    if ( aExp == 0 ) {
3509
        if ( aSig == 0 ) return float64_zero;
3510
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3511
    }
3512
    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3513
    aSig |= LIT64( 0x0010000000000000 );
3514
    zSig = estimateSqrt32( aExp, aSig>>21 );
3515
    aSig <<= 9 - ( aExp & 1 );
3516
    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3517
    if ( ( zSig & 0x1FF ) <= 5 ) {
3518
        doubleZSig = zSig<<1;
3519
        mul64To128( zSig, zSig, &term0, &term1 );
3520
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3521
        while ( (int64_t) rem0 < 0 ) {
3522
            --zSig;
3523
            doubleZSig -= 2;
3524
            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3525
        }
3526
        zSig |= ( ( rem0 | rem1 ) != 0 );
3527
    }
3528
    return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3529

    
3530
}
3531

    
3532
/*----------------------------------------------------------------------------
3533
| Returns the binary log of the double-precision floating-point value `a'.
3534
| The operation is performed according to the IEC/IEEE Standard for Binary
3535
| Floating-Point Arithmetic.
3536
*----------------------------------------------------------------------------*/
3537
float64 float64_log2( float64 a STATUS_PARAM )
3538
{
3539
    flag aSign, zSign;
3540
    int16 aExp;
3541
    uint64_t aSig, aSig0, aSig1, zSig, i;
3542
    a = float64_squash_input_denormal(a STATUS_VAR);
3543

    
3544
    aSig = extractFloat64Frac( a );
3545
    aExp = extractFloat64Exp( a );
3546
    aSign = extractFloat64Sign( a );
3547

    
3548
    if ( aExp == 0 ) {
3549
        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3550
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3551
    }
3552
    if ( aSign ) {
3553
        float_raise( float_flag_invalid STATUS_VAR);
3554
        return float64_default_nan;
3555
    }
3556
    if ( aExp == 0x7FF ) {
3557
        if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3558
        return a;
3559
    }
3560

    
3561
    aExp -= 0x3FF;
3562
    aSig |= LIT64( 0x0010000000000000 );
3563
    zSign = aExp < 0;
3564
    zSig = (uint64_t)aExp << 52;
3565
    for (i = 1LL << 51; i > 0; i >>= 1) {
3566
        mul64To128( aSig, aSig, &aSig0, &aSig1 );
3567
        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3568
        if ( aSig & LIT64( 0x0020000000000000 ) ) {
3569
            aSig >>= 1;
3570
            zSig |= i;
3571
        }
3572
    }
3573

    
3574
    if ( zSign )
3575
        zSig = -zSig;
3576
    return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3577
}
3578

    
3579
/*----------------------------------------------------------------------------
3580
| Returns 1 if the double-precision floating-point value `a' is equal to the
3581
| corresponding value `b', and 0 otherwise.  The comparison is performed
3582
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3583
*----------------------------------------------------------------------------*/
3584

    
3585
int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
3586
{
3587
    uint64_t av, bv;
3588
    a = float64_squash_input_denormal(a STATUS_VAR);
3589
    b = float64_squash_input_denormal(b STATUS_VAR);
3590

    
3591
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3592
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3593
       ) {
3594
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3595
            float_raise( float_flag_invalid STATUS_VAR);
3596
        }
3597
        return 0;
3598
    }
3599
    av = float64_val(a);
3600
    bv = float64_val(b);
3601
    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3602

    
3603
}
3604

    
3605
/*----------------------------------------------------------------------------
3606
| Returns 1 if the double-precision floating-point value `a' is less than or
3607
| equal to the corresponding value `b', and 0 otherwise.  The comparison is
3608
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3609
| Arithmetic.
3610
*----------------------------------------------------------------------------*/
3611

    
3612
int float64_le( float64 a, float64 b STATUS_PARAM )
3613
{
3614
    flag aSign, bSign;
3615
    uint64_t av, bv;
3616
    a = float64_squash_input_denormal(a STATUS_VAR);
3617
    b = float64_squash_input_denormal(b STATUS_VAR);
3618

    
3619
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3620
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3621
       ) {
3622
        float_raise( float_flag_invalid STATUS_VAR);
3623
        return 0;
3624
    }
3625
    aSign = extractFloat64Sign( a );
3626
    bSign = extractFloat64Sign( b );
3627
    av = float64_val(a);
3628
    bv = float64_val(b);
3629
    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3630
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3631

    
3632
}
3633

    
3634
/*----------------------------------------------------------------------------
3635
| Returns 1 if the double-precision floating-point value `a' is less than
3636
| the corresponding value `b', and 0 otherwise.  The comparison is performed
3637
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3638
*----------------------------------------------------------------------------*/
3639

    
3640
int float64_lt( float64 a, float64 b STATUS_PARAM )
3641
{
3642
    flag aSign, bSign;
3643
    uint64_t av, bv;
3644

    
3645
    a = float64_squash_input_denormal(a STATUS_VAR);
3646
    b = float64_squash_input_denormal(b STATUS_VAR);
3647
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3648
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3649
       ) {
3650
        float_raise( float_flag_invalid STATUS_VAR);
3651
        return 0;
3652
    }
3653
    aSign = extractFloat64Sign( a );
3654
    bSign = extractFloat64Sign( b );
3655
    av = float64_val(a);
3656
    bv = float64_val(b);
3657
    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3658
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3659

    
3660
}
3661

    
3662
/*----------------------------------------------------------------------------
3663
| Returns 1 if the double-precision floating-point values `a' and `b' cannot
3664
| be compared, and 0 otherwise.  The comparison is performed according to the
3665
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3666
*----------------------------------------------------------------------------*/
3667

    
3668
int float64_unordered( float64 a, float64 b STATUS_PARAM )
3669
{
3670
    a = float64_squash_input_denormal(a STATUS_VAR);
3671
    b = float64_squash_input_denormal(b STATUS_VAR);
3672

    
3673
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3674
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3675
       ) {
3676
        float_raise( float_flag_invalid STATUS_VAR);
3677
        return 1;
3678
    }
3679
    return 0;
3680
}
3681

    
3682
/*----------------------------------------------------------------------------
3683
| Returns 1 if the double-precision floating-point value `a' is equal to the
3684
| corresponding value `b', and 0 otherwise.  The invalid exception is raised
3685
| if either operand is a NaN.  Otherwise, the comparison is performed
3686
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3687
*----------------------------------------------------------------------------*/
3688

    
3689
int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3690
{
3691
    uint64_t av, bv;
3692
    a = float64_squash_input_denormal(a STATUS_VAR);
3693
    b = float64_squash_input_denormal(b STATUS_VAR);
3694

    
3695
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3696
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3697
       ) {
3698
        float_raise( float_flag_invalid STATUS_VAR);
3699
        return 0;
3700
    }
3701
    av = float64_val(a);
3702
    bv = float64_val(b);
3703
    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3704

    
3705
}
3706

    
3707
/*----------------------------------------------------------------------------
3708
| Returns 1 if the double-precision floating-point value `a' is less than or
3709
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3710
| cause an exception.  Otherwise, the comparison is performed according to the
3711
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3712
*----------------------------------------------------------------------------*/
3713

    
3714
int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3715
{
3716
    flag aSign, bSign;
3717
    uint64_t av, bv;
3718
    a = float64_squash_input_denormal(a STATUS_VAR);
3719
    b = float64_squash_input_denormal(b STATUS_VAR);
3720

    
3721
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3722
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3723
       ) {
3724
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3725
            float_raise( float_flag_invalid STATUS_VAR);
3726
        }
3727
        return 0;
3728
    }
3729
    aSign = extractFloat64Sign( a );
3730
    bSign = extractFloat64Sign( b );
3731
    av = float64_val(a);
3732
    bv = float64_val(b);
3733
    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3734
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3735

    
3736
}
3737

    
3738
/*----------------------------------------------------------------------------
3739
| Returns 1 if the double-precision floating-point value `a' is less than
3740
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3741
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3742
| Standard for Binary Floating-Point Arithmetic.
3743
*----------------------------------------------------------------------------*/
3744

    
3745
int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3746
{
3747
    flag aSign, bSign;
3748
    uint64_t av, bv;
3749
    a = float64_squash_input_denormal(a STATUS_VAR);
3750
    b = float64_squash_input_denormal(b STATUS_VAR);
3751

    
3752
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3753
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3754
       ) {
3755
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3756
            float_raise( float_flag_invalid STATUS_VAR);
3757
        }
3758
        return 0;
3759
    }
3760
    aSign = extractFloat64Sign( a );
3761
    bSign = extractFloat64Sign( b );
3762
    av = float64_val(a);
3763
    bv = float64_val(b);
3764
    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3765
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3766

    
3767
}
3768

    
3769
/*----------------------------------------------------------------------------
3770
| Returns 1 if the double-precision floating-point values `a' and `b' cannot
3771
| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3772
| comparison is performed according to the IEC/IEEE Standard for Binary
3773
| Floating-Point Arithmetic.
3774
*----------------------------------------------------------------------------*/
3775

    
3776
int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
3777
{
3778
    a = float64_squash_input_denormal(a STATUS_VAR);
3779
    b = float64_squash_input_denormal(b STATUS_VAR);
3780

    
3781
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3782
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3783
       ) {
3784
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3785
            float_raise( float_flag_invalid STATUS_VAR);
3786
        }
3787
        return 1;
3788
    }
3789
    return 0;
3790
}
3791

    
3792
#ifdef FLOATX80
3793

    
3794
/*----------------------------------------------------------------------------
3795
| Returns the result of converting the extended double-precision floating-
3796
| point value `a' to the 32-bit two's complement integer format.  The
3797
| conversion is performed according to the IEC/IEEE Standard for Binary
3798
| Floating-Point Arithmetic---which means in particular that the conversion
3799
| is rounded according to the current rounding mode.  If `a' is a NaN, the
3800
| largest positive integer is returned.  Otherwise, if the conversion
3801
| overflows, the largest integer with the same sign as `a' is returned.
3802
*----------------------------------------------------------------------------*/
3803

    
3804
int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3805
{
3806
    flag aSign;
3807
    int32 aExp, shiftCount;
3808
    uint64_t aSig;
3809

    
3810
    aSig = extractFloatx80Frac( a );
3811
    aExp = extractFloatx80Exp( a );
3812
    aSign = extractFloatx80Sign( a );
3813
    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3814
    shiftCount = 0x4037 - aExp;
3815
    if ( shiftCount <= 0 ) shiftCount = 1;
3816
    shift64RightJamming( aSig, shiftCount, &aSig );
3817
    return roundAndPackInt32( aSign, aSig STATUS_VAR );
3818

    
3819
}
3820

    
3821
/*----------------------------------------------------------------------------
3822
| Returns the result of converting the extended double-precision floating-
3823
| point value `a' to the 32-bit two's complement integer format.  The
3824
| conversion is performed according to the IEC/IEEE Standard for Binary
3825
| Floating-Point Arithmetic, except that the conversion is always rounded
3826
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3827
| Otherwise, if the conversion overflows, the largest integer with the same
3828
| sign as `a' is returned.
3829
*----------------------------------------------------------------------------*/
3830

    
3831
int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3832
{
3833
    flag aSign;
3834
    int32 aExp, shiftCount;
3835
    uint64_t aSig, savedASig;
3836
    int32 z;
3837

    
3838
    aSig = extractFloatx80Frac( a );
3839
    aExp = extractFloatx80Exp( a );
3840
    aSign = extractFloatx80Sign( a );
3841
    if ( 0x401E < aExp ) {
3842
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3843
        goto invalid;
3844
    }
3845
    else if ( aExp < 0x3FFF ) {
3846
        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3847
        return 0;
3848
    }
3849
    shiftCount = 0x403E - aExp;
3850
    savedASig = aSig;
3851
    aSig >>= shiftCount;
3852
    z = aSig;
3853
    if ( aSign ) z = - z;
3854
    if ( ( z < 0 ) ^ aSign ) {
3855
 invalid:
3856
        float_raise( float_flag_invalid STATUS_VAR);
3857
        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3858
    }
3859
    if ( ( aSig<<shiftCount ) != savedASig ) {
3860
        STATUS(float_exception_flags) |= float_flag_inexact;
3861
    }
3862
    return z;
3863

    
3864
}
3865

    
3866
/*----------------------------------------------------------------------------
3867
| Returns the result of converting the extended double-precision floating-
3868
| point value `a' to the 64-bit two's complement integer format.  The
3869
| conversion is performed according to the IEC/IEEE Standard for Binary
3870
| Floating-Point Arithmetic---which means in particular that the conversion
3871
| is rounded according to the current rounding mode.  If `a' is a NaN,
3872
| the largest positive integer is returned.  Otherwise, if the conversion
3873
| overflows, the largest integer with the same sign as `a' is returned.
3874
*----------------------------------------------------------------------------*/
3875

    
3876
int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3877
{
3878
    flag aSign;
3879
    int32 aExp, shiftCount;
3880
    uint64_t aSig, aSigExtra;
3881

    
3882
    aSig = extractFloatx80Frac( a );
3883
    aExp = extractFloatx80Exp( a );
3884
    aSign = extractFloatx80Sign( a );
3885
    shiftCount = 0x403E - aExp;
3886
    if ( shiftCount <= 0 ) {
3887
        if ( shiftCount ) {
3888
            float_raise( float_flag_invalid STATUS_VAR);
3889
            if (    ! aSign
3890
                 || (    ( aExp == 0x7FFF )
3891
                      && ( aSig != LIT64( 0x8000000000000000 ) ) )
3892
               ) {
3893
                return LIT64( 0x7FFFFFFFFFFFFFFF );
3894
            }
3895
            return (int64_t) LIT64( 0x8000000000000000 );
3896
        }
3897
        aSigExtra = 0;
3898
    }
3899
    else {
3900
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3901
    }
3902
    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3903

    
3904
}
3905

    
3906
/*----------------------------------------------------------------------------
3907
| Returns the result of converting the extended double-precision floating-
3908
| point value `a' to the 64-bit two's complement integer format.  The
3909
| conversion is performed according to the IEC/IEEE Standard for Binary
3910
| Floating-Point Arithmetic, except that the conversion is always rounded
3911
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3912
| Otherwise, if the conversion overflows, the largest integer with the same
3913
| sign as `a' is returned.
3914
*----------------------------------------------------------------------------*/
3915

    
3916
int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3917
{
3918
    flag aSign;
3919
    int32 aExp, shiftCount;
3920
    uint64_t aSig;
3921
    int64 z;
3922

    
3923
    aSig = extractFloatx80Frac( a );
3924
    aExp = extractFloatx80Exp( a );
3925
    aSign = extractFloatx80Sign( a );
3926
    shiftCount = aExp - 0x403E;
3927
    if ( 0 <= shiftCount ) {
3928
        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3929
        if ( ( a.high != 0xC03E ) || aSig ) {
3930
            float_raise( float_flag_invalid STATUS_VAR);
3931
            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3932
                return LIT64( 0x7FFFFFFFFFFFFFFF );
3933
            }
3934
        }
3935
        return (int64_t) LIT64( 0x8000000000000000 );
3936
    }
3937
    else if ( aExp < 0x3FFF ) {
3938
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3939
        return 0;
3940
    }
3941
    z = aSig>>( - shiftCount );
3942
    if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3943
        STATUS(float_exception_flags) |= float_flag_inexact;
3944
    }
3945
    if ( aSign ) z = - z;
3946
    return z;
3947

    
3948
}
3949

    
3950
/*----------------------------------------------------------------------------
3951
| Returns the result of converting the extended double-precision floating-
3952
| point value `a' to the single-precision floating-point format.  The
3953
| conversion is performed according to the IEC/IEEE Standard for Binary
3954
| Floating-Point Arithmetic.
3955
*----------------------------------------------------------------------------*/
3956

    
3957
float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3958
{
3959
    flag aSign;
3960
    int32 aExp;
3961
    uint64_t aSig;
3962

    
3963
    aSig = extractFloatx80Frac( a );
3964
    aExp = extractFloatx80Exp( a );
3965
    aSign = extractFloatx80Sign( a );
3966
    if ( aExp == 0x7FFF ) {
3967
        if ( (uint64_t) ( aSig<<1 ) ) {
3968
            return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3969
        }
3970
        return packFloat32( aSign, 0xFF, 0 );
3971
    }
3972
    shift64RightJamming( aSig, 33, &aSig );
3973
    if ( aExp || aSig ) aExp -= 0x3F81;
3974
    return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3975

    
3976
}
3977

    
3978
/*----------------------------------------------------------------------------
3979
| Returns the result of converting the extended double-precision floating-
3980
| point value `a' to the double-precision floating-point format.  The
3981
| conversion is performed according to the IEC/IEEE Standard for Binary
3982
| Floating-Point Arithmetic.
3983
*----------------------------------------------------------------------------*/
3984

    
3985
float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3986
{
3987
    flag aSign;
3988
    int32 aExp;
3989
    uint64_t aSig, zSig;
3990

    
3991
    aSig = extractFloatx80Frac( a );
3992
    aExp = extractFloatx80Exp( a );
3993
    aSign = extractFloatx80Sign( a );
3994
    if ( aExp == 0x7FFF ) {
3995
        if ( (uint64_t) ( aSig<<1 ) ) {
3996
            return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3997
        }
3998
        return packFloat64( aSign, 0x7FF, 0 );
3999
    }
4000
    shift64RightJamming( aSig, 1, &zSig );
4001
    if ( aExp || aSig ) aExp -= 0x3C01;
4002
    return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4003

    
4004
}
4005

    
4006
#ifdef FLOAT128
4007

    
4008
/*----------------------------------------------------------------------------
4009
| Returns the result of converting the extended double-precision floating-
4010
| point value `a' to the quadruple-precision floating-point format.  The
4011
| conversion is performed according to the IEC/IEEE Standard for Binary
4012
| Floating-Point Arithmetic.
4013
*----------------------------------------------------------------------------*/
4014

    
4015
float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4016
{
4017
    flag aSign;
4018
    int16 aExp;
4019
    uint64_t aSig, zSig0, zSig1;
4020

    
4021
    aSig = extractFloatx80Frac( a );
4022
    aExp = extractFloatx80Exp( a );
4023
    aSign = extractFloatx80Sign( a );
4024
    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4025
        return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4026
    }
4027
    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4028
    return packFloat128( aSign, aExp, zSig0, zSig1 );
4029

    
4030
}
4031

    
4032
#endif
4033

    
4034
/*----------------------------------------------------------------------------
4035
| Rounds the extended double-precision floating-point value `a' to an integer,
4036
| and returns the result as an extended quadruple-precision floating-point
4037
| value.  The operation is performed according to the IEC/IEEE Standard for
4038
| Binary Floating-Point Arithmetic.
4039
*----------------------------------------------------------------------------*/
4040

    
4041
floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4042
{
4043
    flag aSign;
4044
    int32 aExp;
4045
    uint64_t lastBitMask, roundBitsMask;
4046
    int8 roundingMode;
4047
    floatx80 z;
4048

    
4049
    aExp = extractFloatx80Exp( a );
4050
    if ( 0x403E <= aExp ) {
4051
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4052
            return propagateFloatx80NaN( a, a STATUS_VAR );
4053
        }
4054
        return a;
4055
    }
4056
    if ( aExp < 0x3FFF ) {
4057
        if (    ( aExp == 0 )
4058
             && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4059
            return a;
4060
        }
4061
        STATUS(float_exception_flags) |= float_flag_inexact;
4062
        aSign = extractFloatx80Sign( a );
4063
        switch ( STATUS(float_rounding_mode) ) {
4064
         case float_round_nearest_even:
4065
            if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4066
               ) {
4067
                return
4068
                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4069
            }
4070
            break;
4071
         case float_round_down:
4072
            return
4073
                  aSign ?
4074
                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4075
                : packFloatx80( 0, 0, 0 );
4076
         case float_round_up:
4077
            return
4078
                  aSign ? packFloatx80( 1, 0, 0 )
4079
                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4080
        }
4081
        return packFloatx80( aSign, 0, 0 );
4082
    }
4083
    lastBitMask = 1;
4084
    lastBitMask <<= 0x403E - aExp;
4085
    roundBitsMask = lastBitMask - 1;
4086
    z = a;
4087
    roundingMode = STATUS(float_rounding_mode);
4088
    if ( roundingMode == float_round_nearest_even ) {
4089
        z.low += lastBitMask>>1;
4090
        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4091
    }
4092
    else if ( roundingMode != float_round_to_zero ) {
4093
        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4094
            z.low += roundBitsMask;
4095
        }
4096
    }
4097
    z.low &= ~ roundBitsMask;
4098
    if ( z.low == 0 ) {
4099
        ++z.high;
4100
        z.low = LIT64( 0x8000000000000000 );
4101
    }
4102
    if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4103
    return z;
4104

    
4105
}
4106

    
4107
/*----------------------------------------------------------------------------
4108
| Returns the result of adding the absolute values of the extended double-
4109
| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4110
| negated before being returned.  `zSign' is ignored if the result is a NaN.
4111
| The addition is performed according to the IEC/IEEE Standard for Binary
4112
| Floating-Point Arithmetic.
4113
*----------------------------------------------------------------------------*/
4114

    
4115
static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4116
{
4117
    int32 aExp, bExp, zExp;
4118
    uint64_t aSig, bSig, zSig0, zSig1;
4119
    int32 expDiff;
4120

    
4121
    aSig = extractFloatx80Frac( a );
4122
    aExp = extractFloatx80Exp( a );
4123
    bSig = extractFloatx80Frac( b );
4124
    bExp = extractFloatx80Exp( b );
4125
    expDiff = aExp - bExp;
4126
    if ( 0 < expDiff ) {
4127
        if ( aExp == 0x7FFF ) {
4128
            if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4129
            return a;
4130
        }
4131
        if ( bExp == 0 ) --expDiff;
4132
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4133
        zExp = aExp;
4134
    }
4135
    else if ( expDiff < 0 ) {
4136
        if ( bExp == 0x7FFF ) {
4137
            if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4138
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4139
        }
4140
        if ( aExp == 0 ) ++expDiff;
4141
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4142
        zExp = bExp;
4143
    }
4144
    else {
4145
        if ( aExp == 0x7FFF ) {
4146
            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4147
                return propagateFloatx80NaN( a, b STATUS_VAR );
4148
            }
4149
            return a;
4150
        }
4151
        zSig1 = 0;
4152
        zSig0 = aSig + bSig;
4153
        if ( aExp == 0 ) {
4154
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4155
            goto roundAndPack;
4156
        }
4157
        zExp = aExp;
4158
        goto shiftRight1;
4159
    }
4160
    zSig0 = aSig + bSig;
4161
    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4162
 shiftRight1:
4163
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4164
    zSig0 |= LIT64( 0x8000000000000000 );
4165
    ++zExp;
4166
 roundAndPack:
4167
    return
4168
        roundAndPackFloatx80(
4169
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4170

    
4171
}
4172

    
4173
/*----------------------------------------------------------------------------
4174
| Returns the result of subtracting the absolute values of the extended
4175
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4176
| difference is negated before being returned.  `zSign' is ignored if the
4177
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
4178
| Standard for Binary Floating-Point Arithmetic.
4179
*----------------------------------------------------------------------------*/
4180

    
4181
static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4182
{
4183
    int32 aExp, bExp, zExp;
4184
    uint64_t aSig, bSig, zSig0, zSig1;
4185
    int32 expDiff;
4186
    floatx80 z;
4187

    
4188
    aSig = extractFloatx80Frac( a );
4189
    aExp = extractFloatx80Exp( a );
4190
    bSig = extractFloatx80Frac( b );
4191
    bExp = extractFloatx80Exp( b );
4192
    expDiff = aExp - bExp;
4193
    if ( 0 < expDiff ) goto aExpBigger;
4194
    if ( expDiff < 0 ) goto bExpBigger;
4195
    if ( aExp == 0x7FFF ) {
4196
        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4197
            return propagateFloatx80NaN( a, b STATUS_VAR );
4198
        }
4199
        float_raise( float_flag_invalid STATUS_VAR);
4200
        z.low = floatx80_default_nan_low;
4201
        z.high = floatx80_default_nan_high;
4202
        return z;
4203
    }
4204
    if ( aExp == 0 ) {
4205
        aExp = 1;
4206
        bExp = 1;
4207
    }
4208
    zSig1 = 0;
4209
    if ( bSig < aSig ) goto aBigger;
4210
    if ( aSig < bSig ) goto bBigger;
4211
    return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4212
 bExpBigger:
4213
    if ( bExp == 0x7FFF ) {
4214
        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4215
        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4216
    }
4217
    if ( aExp == 0 ) ++expDiff;
4218
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4219
 bBigger:
4220
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4221
    zExp = bExp;
4222
    zSign ^= 1;
4223
    goto normalizeRoundAndPack;
4224
 aExpBigger:
4225
    if ( aExp == 0x7FFF ) {
4226
        if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4227
        return a;
4228
    }
4229
    if ( bExp == 0 ) --expDiff;
4230
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4231
 aBigger:
4232
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4233
    zExp = aExp;
4234
 normalizeRoundAndPack:
4235
    return
4236
        normalizeRoundAndPackFloatx80(
4237
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4238

    
4239
}
4240

    
4241
/*----------------------------------------------------------------------------
4242
| Returns the result of adding the extended double-precision floating-point
4243
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4244
| Standard for Binary Floating-Point Arithmetic.
4245
*----------------------------------------------------------------------------*/
4246

    
4247
floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4248
{
4249
    flag aSign, bSign;
4250

    
4251
    aSign = extractFloatx80Sign( a );
4252
    bSign = extractFloatx80Sign( b );
4253
    if ( aSign == bSign ) {
4254
        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4255
    }
4256
    else {
4257
        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4258
    }
4259

    
4260
}
4261

    
4262
/*----------------------------------------------------------------------------
4263
| Returns the result of subtracting the extended double-precision floating-
4264
| point values `a' and `b'.  The operation is performed according to the
4265
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4266
*----------------------------------------------------------------------------*/
4267

    
4268
floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4269
{
4270
    flag aSign, bSign;
4271

    
4272
    aSign = extractFloatx80Sign( a );
4273
    bSign = extractFloatx80Sign( b );
4274
    if ( aSign == bSign ) {
4275
        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4276
    }
4277
    else {
4278
        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4279
    }
4280

    
4281
}
4282

    
4283
/*----------------------------------------------------------------------------
4284
| Returns the result of multiplying the extended double-precision floating-
4285
| point values `a' and `b'.  The operation is performed according to the
4286
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4287
*----------------------------------------------------------------------------*/
4288

    
4289
floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4290
{
4291
    flag aSign, bSign, zSign;
4292
    int32 aExp, bExp, zExp;
4293
    uint64_t aSig, bSig, zSig0, zSig1;
4294
    floatx80 z;
4295

    
4296
    aSig = extractFloatx80Frac( a );
4297
    aExp = extractFloatx80Exp( a );
4298
    aSign = extractFloatx80Sign( a );
4299
    bSig = extractFloatx80Frac( b );
4300
    bExp = extractFloatx80Exp( b );
4301
    bSign = extractFloatx80Sign( b );
4302
    zSign = aSign ^ bSign;
4303
    if ( aExp == 0x7FFF ) {
4304
        if (    (uint64_t) ( aSig<<1 )
4305
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4306
            return propagateFloatx80NaN( a, b STATUS_VAR );
4307
        }
4308
        if ( ( bExp | bSig ) == 0 ) goto invalid;
4309
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4310
    }
4311
    if ( bExp == 0x7FFF ) {
4312
        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4313
        if ( ( aExp | aSig ) == 0 ) {
4314
 invalid:
4315
            float_raise( float_flag_invalid STATUS_VAR);
4316
            z.low = floatx80_default_nan_low;
4317
            z.high = floatx80_default_nan_high;
4318
            return z;
4319
        }
4320
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4321
    }
4322
    if ( aExp == 0 ) {
4323
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4324
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4325
    }
4326
    if ( bExp == 0 ) {
4327
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4328
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4329
    }
4330
    zExp = aExp + bExp - 0x3FFE;
4331
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
4332
    if ( 0 < (int64_t) zSig0 ) {
4333
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4334
        --zExp;
4335
    }
4336
    return
4337
        roundAndPackFloatx80(
4338
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4339

    
4340
}
4341

    
4342
/*----------------------------------------------------------------------------
4343
| Returns the result of dividing the extended double-precision floating-point
4344
| value `a' by the corresponding value `b'.  The operation is performed
4345
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4346
*----------------------------------------------------------------------------*/
4347

    
4348
floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4349
{
4350
    flag aSign, bSign, zSign;
4351
    int32 aExp, bExp, zExp;
4352
    uint64_t aSig, bSig, zSig0, zSig1;
4353
    uint64_t rem0, rem1, rem2, term0, term1, term2;
4354
    floatx80 z;
4355

    
4356
    aSig = extractFloatx80Frac( a );
4357
    aExp = extractFloatx80Exp( a );
4358
    aSign = extractFloatx80Sign( a );
4359
    bSig = extractFloatx80Frac( b );
4360
    bExp = extractFloatx80Exp( b );
4361
    bSign = extractFloatx80Sign( b );
4362
    zSign = aSign ^ bSign;
4363
    if ( aExp == 0x7FFF ) {
4364
        if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4365
        if ( bExp == 0x7FFF ) {
4366
            if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4367
            goto invalid;
4368
        }
4369
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4370
    }
4371
    if ( bExp == 0x7FFF ) {
4372
        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4373
        return packFloatx80( zSign, 0, 0 );
4374
    }
4375
    if ( bExp == 0 ) {
4376
        if ( bSig == 0 ) {
4377
            if ( ( aExp | aSig ) == 0 ) {
4378
 invalid:
4379
                float_raise( float_flag_invalid STATUS_VAR);
4380
                z.low = floatx80_default_nan_low;
4381
                z.high = floatx80_default_nan_high;
4382
                return z;
4383
            }
4384
            float_raise( float_flag_divbyzero STATUS_VAR);
4385
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4386
        }
4387
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4388
    }
4389
    if ( aExp == 0 ) {
4390
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4391
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4392
    }
4393
    zExp = aExp - bExp + 0x3FFE;
4394
    rem1 = 0;
4395
    if ( bSig <= aSig ) {
4396
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
4397
        ++zExp;
4398
    }
4399
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4400
    mul64To128( bSig, zSig0, &term0, &term1 );
4401
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4402
    while ( (int64_t) rem0 < 0 ) {
4403
        --zSig0;
4404
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4405
    }
4406
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
4407
    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4408
        mul64To128( bSig, zSig1, &term1, &term2 );
4409
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4410
        while ( (int64_t) rem1 < 0 ) {
4411
            --zSig1;
4412
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4413
        }
4414
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
4415
    }
4416
    return
4417
        roundAndPackFloatx80(
4418
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4419

    
4420
}
4421

    
4422
/*----------------------------------------------------------------------------
4423
| Returns the remainder of the extended double-precision floating-point value
4424
| `a' with respect to the corresponding value `b'.  The operation is performed
4425
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4426
*----------------------------------------------------------------------------*/
4427

    
4428
floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4429
{
4430
    flag aSign, zSign;
4431
    int32 aExp, bExp, expDiff;
4432
    uint64_t aSig0, aSig1, bSig;
4433
    uint64_t q, term0, term1, alternateASig0, alternateASig1;
4434
    floatx80 z;
4435

    
4436
    aSig0 = extractFloatx80Frac( a );
4437
    aExp = extractFloatx80Exp( a );
4438
    aSign = extractFloatx80Sign( a );
4439
    bSig = extractFloatx80Frac( b );
4440
    bExp = extractFloatx80Exp( b );
4441
    if ( aExp == 0x7FFF ) {
4442
        if (    (uint64_t) ( aSig0<<1 )
4443
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4444
            return propagateFloatx80NaN( a, b STATUS_VAR );
4445
        }
4446
        goto invalid;
4447
    }
4448
    if ( bExp == 0x7FFF ) {
4449
        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4450
        return a;
4451
    }
4452
    if ( bExp == 0 ) {
4453
        if ( bSig == 0 ) {
4454
 invalid:
4455
            float_raise( float_flag_invalid STATUS_VAR);
4456
            z.low = floatx80_default_nan_low;
4457
            z.high = floatx80_default_nan_high;
4458
            return z;
4459
        }
4460
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4461
    }
4462
    if ( aExp == 0 ) {
4463
        if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
4464
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4465
    }
4466
    bSig |= LIT64( 0x8000000000000000 );
4467
    zSign = aSign;
4468
    expDiff = aExp - bExp;
4469
    aSig1 = 0;
4470
    if ( expDiff < 0 ) {
4471
        if ( expDiff < -1 ) return a;
4472
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4473
        expDiff = 0;
4474
    }
4475
    q = ( bSig <= aSig0 );
4476
    if ( q ) aSig0 -= bSig;
4477
    expDiff -= 64;
4478
    while ( 0 < expDiff ) {
4479
        q = estimateDiv128To64( aSig0, aSig1, bSig );
4480
        q = ( 2 < q ) ? q - 2 : 0;
4481
        mul64To128( bSig, q, &term0, &term1 );
4482
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4483
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4484
        expDiff -= 62;
4485
    }
4486
    expDiff += 64;
4487
    if ( 0 < expDiff ) {
4488
        q = estimateDiv128To64( aSig0, aSig1, bSig );
4489
        q = ( 2 < q ) ? q - 2 : 0;
4490
        q >>= 64 - expDiff;
4491
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4492
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4493
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4494
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
4495
            ++q;
4496
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4497
        }
4498
    }
4499
    else {
4500
        term1 = 0;
4501
        term0 = bSig;
4502
    }
4503
    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4504
    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4505
         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4506
              && ( q & 1 ) )
4507
       ) {
4508
        aSig0 = alternateASig0;
4509
        aSig1 = alternateASig1;
4510
        zSign = ! zSign;
4511
    }
4512
    return
4513
        normalizeRoundAndPackFloatx80(
4514
            80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4515

    
4516
}
4517

    
4518
/*----------------------------------------------------------------------------
4519
| Returns the square root of the extended double-precision floating-point
4520
| value `a'.  The operation is performed according to the IEC/IEEE Standard
4521
| for Binary Floating-Point Arithmetic.
4522
*----------------------------------------------------------------------------*/
4523

    
4524
floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4525
{
4526
    flag aSign;
4527
    int32 aExp, zExp;
4528
    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4529
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4530
    floatx80 z;
4531

    
4532
    aSig0 = extractFloatx80Frac( a );
4533
    aExp = extractFloatx80Exp( a );
4534
    aSign = extractFloatx80Sign( a );
4535
    if ( aExp == 0x7FFF ) {
4536
        if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4537
        if ( ! aSign ) return a;
4538
        goto invalid;
4539
    }
4540
    if ( aSign ) {
4541
        if ( ( aExp | aSig0 ) == 0 ) return a;
4542
 invalid:
4543
        float_raise( float_flag_invalid STATUS_VAR);
4544
        z.low = floatx80_default_nan_low;
4545
        z.high = floatx80_default_nan_high;
4546
        return z;
4547
    }
4548
    if ( aExp == 0 ) {
4549
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4550
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4551
    }
4552
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4553
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4554
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4555
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4556
    doubleZSig0 = zSig0<<1;
4557
    mul64To128( zSig0, zSig0, &term0, &term1 );
4558
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4559
    while ( (int64_t) rem0 < 0 ) {
4560
        --zSig0;
4561
        doubleZSig0 -= 2;
4562
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4563
    }
4564
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4565
    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4566
        if ( zSig1 == 0 ) zSig1 = 1;
4567
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4568
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4569
        mul64To128( zSig1, zSig1, &term2, &term3 );
4570
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4571
        while ( (int64_t) rem1 < 0 ) {
4572
            --zSig1;
4573
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4574
            term3 |= 1;
4575
            term2 |= doubleZSig0;
4576
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4577
        }
4578
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4579
    }
4580
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4581
    zSig0 |= doubleZSig0;
4582
    return
4583
        roundAndPackFloatx80(
4584
            STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4585

    
4586
}
4587

    
4588
/*----------------------------------------------------------------------------
4589
| Returns 1 if the extended double-precision floating-point value `a' is
4590
| equal to the corresponding value `b', and 0 otherwise.  The comparison is
4591
| performed according to the IEC/IEEE Standard for Binary Floating-Point
4592
| Arithmetic.
4593
*----------------------------------------------------------------------------*/
4594

    
4595
int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4596
{
4597

    
4598
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4599
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4600
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4601
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4602
       ) {
4603
        if (    floatx80_is_signaling_nan( a )
4604
             || floatx80_is_signaling_nan( b ) ) {
4605
            float_raise( float_flag_invalid STATUS_VAR);
4606
        }
4607
        return 0;
4608
    }
4609
    return
4610
           ( a.low == b.low )
4611
        && (    ( a.high == b.high )
4612
             || (    ( a.low == 0 )
4613
                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4614
           );
4615

    
4616
}
4617

    
4618
/*----------------------------------------------------------------------------
4619
| Returns 1 if the extended double-precision floating-point value `a' is
4620
| less than or equal to the corresponding value `b', and 0 otherwise.  The
4621
| comparison is performed according to the IEC/IEEE Standard for Binary
4622
| Floating-Point Arithmetic.
4623
*----------------------------------------------------------------------------*/
4624

    
4625
int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4626
{
4627
    flag aSign, bSign;
4628

    
4629
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4630
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4631
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4632
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4633
       ) {
4634
        float_raise( float_flag_invalid STATUS_VAR);
4635
        return 0;
4636
    }
4637
    aSign = extractFloatx80Sign( a );
4638
    bSign = extractFloatx80Sign( b );
4639
    if ( aSign != bSign ) {
4640
        return
4641
               aSign
4642
            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4643
                 == 0 );
4644
    }
4645
    return
4646
          aSign ? le128( b.high, b.low, a.high, a.low )
4647
        : le128( a.high, a.low, b.high, b.low );
4648

    
4649
}
4650

    
4651
/*----------------------------------------------------------------------------
4652
| Returns 1 if the extended double-precision floating-point value `a' is
4653
| less than the corresponding value `b', and 0 otherwise.  The comparison
4654
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4655
| Arithmetic.
4656
*----------------------------------------------------------------------------*/
4657

    
4658
int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4659
{
4660
    flag aSign, bSign;
4661

    
4662
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4663
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4664
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4665
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4666
       ) {
4667
        float_raise( float_flag_invalid STATUS_VAR);
4668
        return 0;
4669
    }
4670
    aSign = extractFloatx80Sign( a );
4671
    bSign = extractFloatx80Sign( b );
4672
    if ( aSign != bSign ) {
4673
        return
4674
               aSign
4675
            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4676
                 != 0 );
4677
    }
4678
    return
4679
          aSign ? lt128( b.high, b.low, a.high, a.low )
4680
        : lt128( a.high, a.low, b.high, b.low );
4681

    
4682
}
4683

    
4684
/*----------------------------------------------------------------------------
4685
| Returns 1 if the extended double-precision floating-point values `a' and `b'
4686
| cannot be compared, and 0 otherwise.  The comparison is performed according
4687
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4688
*----------------------------------------------------------------------------*/
4689
int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
4690
{
4691
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4692
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4693
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4694
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4695
       ) {
4696
        float_raise( float_flag_invalid STATUS_VAR);
4697
        return 1;
4698
    }
4699
    return 0;
4700
}
4701

    
4702
/*----------------------------------------------------------------------------
4703
| Returns 1 if the extended double-precision floating-point value `a' is equal
4704
| to the corresponding value `b', and 0 otherwise.  The invalid exception is
4705
| raised if either operand is a NaN.  Otherwise, the comparison is performed
4706
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4707
*----------------------------------------------------------------------------*/
4708

    
4709
int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4710
{
4711

    
4712
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4713
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4714
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4715
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4716
       ) {
4717
        float_raise( float_flag_invalid STATUS_VAR);
4718
        return 0;
4719
    }
4720
    return
4721
           ( a.low == b.low )
4722
        && (    ( a.high == b.high )
4723
             || (    ( a.low == 0 )
4724
                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4725
           );
4726

    
4727
}
4728

    
4729
/*----------------------------------------------------------------------------
4730
| Returns 1 if the extended double-precision floating-point value `a' is less
4731
| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4732
| do not cause an exception.  Otherwise, the comparison is performed according
4733
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4734
*----------------------------------------------------------------------------*/
4735

    
4736
int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4737
{
4738
    flag aSign, bSign;
4739

    
4740
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4741
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4742
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4743
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4744
       ) {
4745
        if (    floatx80_is_signaling_nan( a )
4746
             || floatx80_is_signaling_nan( b ) ) {
4747
            float_raise( float_flag_invalid STATUS_VAR);
4748
        }
4749
        return 0;
4750
    }
4751
    aSign = extractFloatx80Sign( a );
4752
    bSign = extractFloatx80Sign( b );
4753
    if ( aSign != bSign ) {
4754
        return
4755
               aSign
4756
            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4757
                 == 0 );
4758
    }
4759
    return
4760
          aSign ? le128( b.high, b.low, a.high, a.low )
4761
        : le128( a.high, a.low, b.high, b.low );
4762

    
4763
}
4764

    
4765
/*----------------------------------------------------------------------------
4766
| Returns 1 if the extended double-precision floating-point value `a' is less
4767
| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4768
| an exception.  Otherwise, the comparison is performed according to the
4769
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4770
*----------------------------------------------------------------------------*/
4771

    
4772
int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4773
{
4774
    flag aSign, bSign;
4775

    
4776
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4777
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4778
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4779
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4780
       ) {
4781
        if (    floatx80_is_signaling_nan( a )
4782
             || floatx80_is_signaling_nan( b ) ) {
4783
            float_raise( float_flag_invalid STATUS_VAR);
4784
        }
4785
        return 0;
4786
    }
4787
    aSign = extractFloatx80Sign( a );
4788
    bSign = extractFloatx80Sign( b );
4789
    if ( aSign != bSign ) {
4790
        return
4791
               aSign
4792
            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4793
                 != 0 );
4794
    }
4795
    return
4796
          aSign ? lt128( b.high, b.low, a.high, a.low )
4797
        : lt128( a.high, a.low, b.high, b.low );
4798

    
4799
}
4800

    
4801
/*----------------------------------------------------------------------------
4802
| Returns 1 if the extended double-precision floating-point values `a' and `b'
4803
| cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
4804
| The comparison is performed according to the IEC/IEEE Standard for Binary
4805
| Floating-Point Arithmetic.
4806
*----------------------------------------------------------------------------*/
4807
int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4808
{
4809
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4810
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4811
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4812
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4813
       ) {
4814
        if (    floatx80_is_signaling_nan( a )
4815
             || floatx80_is_signaling_nan( b ) ) {
4816
            float_raise( float_flag_invalid STATUS_VAR);
4817
        }
4818
        return 1;
4819
    }
4820
    return 0;
4821
}
4822

    
4823
#endif
4824

    
4825
#ifdef FLOAT128
4826

    
4827
/*----------------------------------------------------------------------------
4828
| Returns the result of converting the quadruple-precision floating-point
4829
| value `a' to the 32-bit two's complement integer format.  The conversion
4830
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4831
| Arithmetic---which means in particular that the conversion is rounded
4832
| according to the current rounding mode.  If `a' is a NaN, the largest
4833
| positive integer is returned.  Otherwise, if the conversion overflows, the
4834
| largest integer with the same sign as `a' is returned.
4835
*----------------------------------------------------------------------------*/
4836

    
4837
int32 float128_to_int32( float128 a STATUS_PARAM )
4838
{
4839
    flag aSign;
4840
    int32 aExp, shiftCount;
4841
    uint64_t aSig0, aSig1;
4842

    
4843
    aSig1 = extractFloat128Frac1( a );
4844
    aSig0 = extractFloat128Frac0( a );
4845
    aExp = extractFloat128Exp( a );
4846
    aSign = extractFloat128Sign( a );
4847
    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4848
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4849
    aSig0 |= ( aSig1 != 0 );
4850
    shiftCount = 0x4028 - aExp;
4851
    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4852
    return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4853

    
4854
}
4855

    
4856
/*----------------------------------------------------------------------------
4857
| Returns the result of converting the quadruple-precision floating-point
4858
| value `a' to the 32-bit two's complement integer format.  The conversion
4859
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4860
| Arithmetic, except that the conversion is always rounded toward zero.  If
4861
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4862
| conversion overflows, the largest integer with the same sign as `a' is
4863
| returned.
4864
*----------------------------------------------------------------------------*/
4865

    
4866
int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4867
{
4868
    flag aSign;
4869
    int32 aExp, shiftCount;
4870
    uint64_t aSig0, aSig1, savedASig;
4871
    int32 z;
4872

    
4873
    aSig1 = extractFloat128Frac1( a );
4874
    aSig0 = extractFloat128Frac0( a );
4875
    aExp = extractFloat128Exp( a );
4876
    aSign = extractFloat128Sign( a );
4877
    aSig0 |= ( aSig1 != 0 );
4878
    if ( 0x401E < aExp ) {
4879
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4880
        goto invalid;
4881
    }
4882
    else if ( aExp < 0x3FFF ) {
4883
        if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4884
        return 0;
4885
    }
4886
    aSig0 |= LIT64( 0x0001000000000000 );
4887
    shiftCount = 0x402F - aExp;
4888
    savedASig = aSig0;
4889
    aSig0 >>= shiftCount;
4890
    z = aSig0;
4891
    if ( aSign ) z = - z;
4892
    if ( ( z < 0 ) ^ aSign ) {
4893
 invalid:
4894
        float_raise( float_flag_invalid STATUS_VAR);
4895
        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4896
    }
4897
    if ( ( aSig0<<shiftCount ) != savedASig ) {
4898
        STATUS(float_exception_flags) |= float_flag_inexact;
4899
    }
4900
    return z;
4901

    
4902
}
4903

    
4904
/*----------------------------------------------------------------------------
4905
| Returns the result of converting the quadruple-precision floating-point
4906
| value `a' to the 64-bit two's complement integer format.  The conversion
4907
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4908
| Arithmetic---which means in particular that the conversion is rounded
4909
| according to the current rounding mode.  If `a' is a NaN, the largest
4910
| positive integer is returned.  Otherwise, if the conversion overflows, the
4911
| largest integer with the same sign as `a' is returned.
4912
*----------------------------------------------------------------------------*/
4913

    
4914
int64 float128_to_int64( float128 a STATUS_PARAM )
4915
{
4916
    flag aSign;
4917
    int32 aExp, shiftCount;
4918
    uint64_t aSig0, aSig1;
4919

    
4920
    aSig1 = extractFloat128Frac1( a );
4921
    aSig0 = extractFloat128Frac0( a );
4922
    aExp = extractFloat128Exp( a );
4923
    aSign = extractFloat128Sign( a );
4924
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4925
    shiftCount = 0x402F - aExp;
4926
    if ( shiftCount <= 0 ) {
4927
        if ( 0x403E < aExp ) {
4928
            float_raise( float_flag_invalid STATUS_VAR);
4929
            if (    ! aSign
4930
                 || (    ( aExp == 0x7FFF )
4931
                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4932
                    )
4933
               ) {
4934
                return LIT64( 0x7FFFFFFFFFFFFFFF );
4935
            }
4936
            return (int64_t) LIT64( 0x8000000000000000 );
4937
        }
4938
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4939
    }
4940
    else {
4941
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4942
    }
4943
    return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4944

    
4945
}
4946

    
4947
/*----------------------------------------------------------------------------
4948
| Returns the result of converting the quadruple-precision floating-point
4949
| value `a' to the 64-bit two's complement integer format.  The conversion
4950
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4951
| Arithmetic, except that the conversion is always rounded toward zero.
4952
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4953
| the conversion overflows, the largest integer with the same sign as `a' is
4954
| returned.
4955
*----------------------------------------------------------------------------*/
4956

    
4957
int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4958
{
4959
    flag aSign;
4960
    int32 aExp, shiftCount;
4961
    uint64_t aSig0, aSig1;
4962
    int64 z;
4963

    
4964
    aSig1 = extractFloat128Frac1( a );
4965
    aSig0 = extractFloat128Frac0( a );
4966
    aExp = extractFloat128Exp( a );
4967
    aSign = extractFloat128Sign( a );
4968
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4969
    shiftCount = aExp - 0x402F;
4970
    if ( 0 < shiftCount ) {
4971
        if ( 0x403E <= aExp ) {
4972
            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4973
            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4974
                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4975
                if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4976
            }
4977
            else {
4978
                float_raise( float_flag_invalid STATUS_VAR);
4979
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4980
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
4981
                }
4982
            }
4983
            return (int64_t) LIT64( 0x8000000000000000 );
4984
        }
4985
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4986
        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
4987
            STATUS(float_exception_flags) |= float_flag_inexact;
4988
        }
4989
    }
4990
    else {
4991
        if ( aExp < 0x3FFF ) {
4992
            if ( aExp | aSig0 | aSig1 ) {
4993
                STATUS(float_exception_flags) |= float_flag_inexact;
4994
            }
4995
            return 0;
4996
        }
4997
        z = aSig0>>( - shiftCount );
4998
        if (    aSig1
4999
             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5000
            STATUS(float_exception_flags) |= float_flag_inexact;
5001
        }
5002
    }
5003
    if ( aSign ) z = - z;
5004
    return z;
5005

    
5006
}
5007

    
5008
/*----------------------------------------------------------------------------
5009
| Returns the result of converting the quadruple-precision floating-point
5010
| value `a' to the single-precision floating-point format.  The conversion
5011
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5012
| Arithmetic.
5013
*----------------------------------------------------------------------------*/
5014

    
5015
float32 float128_to_float32( float128 a STATUS_PARAM )
5016
{
5017
    flag aSign;
5018
    int32 aExp;
5019
    uint64_t aSig0, aSig1;
5020
    uint32_t zSig;
5021

    
5022
    aSig1 = extractFloat128Frac1( a );
5023
    aSig0 = extractFloat128Frac0( a );
5024
    aExp = extractFloat128Exp( a );
5025
    aSign = extractFloat128Sign( a );
5026
    if ( aExp == 0x7FFF ) {
5027
        if ( aSig0 | aSig1 ) {
5028
            return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5029
        }
5030
        return packFloat32( aSign, 0xFF, 0 );
5031
    }
5032
    aSig0 |= ( aSig1 != 0 );
5033
    shift64RightJamming( aSig0, 18, &aSig0 );
5034
    zSig = aSig0;
5035
    if ( aExp || zSig ) {
5036
        zSig |= 0x40000000;
5037
        aExp -= 0x3F81;
5038
    }
5039
    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5040

    
5041
}
5042

    
5043
/*----------------------------------------------------------------------------
5044
| Returns the result of converting the quadruple-precision floating-point
5045
| value `a' to the double-precision floating-point format.  The conversion
5046
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5047
| Arithmetic.
5048
*----------------------------------------------------------------------------*/
5049

    
5050
float64 float128_to_float64( float128 a STATUS_PARAM )
5051
{
5052
    flag aSign;
5053
    int32 aExp;
5054
    uint64_t aSig0, aSig1;
5055

    
5056
    aSig1 = extractFloat128Frac1( a );
5057
    aSig0 = extractFloat128Frac0( a );
5058
    aExp = extractFloat128Exp( a );
5059
    aSign = extractFloat128Sign( a );
5060
    if ( aExp == 0x7FFF ) {
5061
        if ( aSig0 | aSig1 ) {
5062
            return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5063
        }
5064
        return packFloat64( aSign, 0x7FF, 0 );
5065
    }
5066
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5067
    aSig0 |= ( aSig1 != 0 );
5068
    if ( aExp || aSig0 ) {
5069
        aSig0 |= LIT64( 0x4000000000000000 );
5070
        aExp -= 0x3C01;
5071
    }
5072
    return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5073

    
5074
}
5075

    
5076
#ifdef FLOATX80
5077

    
5078
/*----------------------------------------------------------------------------
5079
| Returns the result of converting the quadruple-precision floating-point
5080
| value `a' to the extended double-precision floating-point format.  The
5081
| conversion is performed according to the IEC/IEEE Standard for Binary
5082
| Floating-Point Arithmetic.
5083
*----------------------------------------------------------------------------*/
5084

    
5085
floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5086
{
5087
    flag aSign;
5088
    int32 aExp;
5089
    uint64_t aSig0, aSig1;
5090

    
5091
    aSig1 = extractFloat128Frac1( a );
5092
    aSig0 = extractFloat128Frac0( a );
5093
    aExp = extractFloat128Exp( a );
5094
    aSign = extractFloat128Sign( a );
5095
    if ( aExp == 0x7FFF ) {
5096
        if ( aSig0 | aSig1 ) {
5097
            return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5098
        }
5099
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5100
    }
5101
    if ( aExp == 0 ) {
5102
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5103
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5104
    }
5105
    else {
5106
        aSig0 |= LIT64( 0x0001000000000000 );
5107
    }
5108
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5109
    return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5110

    
5111
}
5112

    
5113
#endif
5114

    
5115
/*----------------------------------------------------------------------------
5116
| Rounds the quadruple-precision floating-point value `a' to an integer, and
5117
| returns the result as a quadruple-precision floating-point value.  The
5118
| operation is performed according to the IEC/IEEE Standard for Binary
5119
| Floating-Point Arithmetic.
5120
*----------------------------------------------------------------------------*/
5121

    
5122
float128 float128_round_to_int( float128 a STATUS_PARAM )
5123
{
5124
    flag aSign;
5125
    int32 aExp;
5126
    uint64_t lastBitMask, roundBitsMask;
5127
    int8 roundingMode;
5128
    float128 z;
5129

    
5130
    aExp = extractFloat128Exp( a );
5131
    if ( 0x402F <= aExp ) {
5132
        if ( 0x406F <= aExp ) {
5133
            if (    ( aExp == 0x7FFF )
5134
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5135
               ) {
5136
                return propagateFloat128NaN( a, a STATUS_VAR );
5137
            }
5138
            return a;
5139
        }
5140
        lastBitMask = 1;
5141
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5142
        roundBitsMask = lastBitMask - 1;
5143
        z = a;
5144
        roundingMode = STATUS(float_rounding_mode);
5145
        if ( roundingMode == float_round_nearest_even ) {
5146
            if ( lastBitMask ) {
5147
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5148
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5149
            }
5150
            else {
5151
                if ( (int64_t) z.low < 0 ) {
5152
                    ++z.high;
5153
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5154
                }
5155
            }
5156
        }
5157
        else if ( roundingMode != float_round_to_zero ) {
5158
            if (   extractFloat128Sign( z )
5159
                 ^ ( roundingMode == float_round_up ) ) {
5160
                add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5161
            }
5162
        }
5163
        z.low &= ~ roundBitsMask;
5164
    }
5165
    else {
5166
        if ( aExp < 0x3FFF ) {
5167
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5168
            STATUS(float_exception_flags) |= float_flag_inexact;
5169
            aSign = extractFloat128Sign( a );
5170
            switch ( STATUS(float_rounding_mode) ) {
5171
             case float_round_nearest_even:
5172
                if (    ( aExp == 0x3FFE )
5173
                     && (   extractFloat128Frac0( a )
5174
                          | extractFloat128Frac1( a ) )
5175
                   ) {
5176
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
5177
                }
5178
                break;
5179
             case float_round_down:
5180
                return
5181
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5182
                    : packFloat128( 0, 0, 0, 0 );
5183
             case float_round_up:
5184
                return
5185
                      aSign ? packFloat128( 1, 0, 0, 0 )
5186
                    : packFloat128( 0, 0x3FFF, 0, 0 );
5187
            }
5188
            return packFloat128( aSign, 0, 0, 0 );
5189
        }
5190
        lastBitMask = 1;
5191
        lastBitMask <<= 0x402F - aExp;
5192
        roundBitsMask = lastBitMask - 1;
5193
        z.low = 0;
5194
        z.high = a.high;
5195
        roundingMode = STATUS(float_rounding_mode);
5196
        if ( roundingMode == float_round_nearest_even ) {
5197
            z.high += lastBitMask>>1;
5198
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5199
                z.high &= ~ lastBitMask;
5200
            }
5201
        }
5202
        else if ( roundingMode != float_round_to_zero ) {
5203
            if (   extractFloat128Sign( z )
5204
                 ^ ( roundingMode == float_round_up ) ) {
5205
                z.high |= ( a.low != 0 );
5206
                z.high += roundBitsMask;
5207
            }
5208
        }
5209
        z.high &= ~ roundBitsMask;
5210
    }
5211
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5212
        STATUS(float_exception_flags) |= float_flag_inexact;
5213
    }
5214
    return z;
5215

    
5216
}
5217

    
5218
/*----------------------------------------------------------------------------
5219
| Returns the result of adding the absolute values of the quadruple-precision
5220
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
5221
| before being returned.  `zSign' is ignored if the result is a NaN.
5222
| The addition is performed according to the IEC/IEEE Standard for Binary
5223
| Floating-Point Arithmetic.
5224
*----------------------------------------------------------------------------*/
5225

    
5226
static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5227
{
5228
    int32 aExp, bExp, zExp;
5229
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5230
    int32 expDiff;
5231

    
5232
    aSig1 = extractFloat128Frac1( a );
5233
    aSig0 = extractFloat128Frac0( a );
5234
    aExp = extractFloat128Exp( a );
5235
    bSig1 = extractFloat128Frac1( b );
5236
    bSig0 = extractFloat128Frac0( b );
5237
    bExp = extractFloat128Exp( b );
5238
    expDiff = aExp - bExp;
5239
    if ( 0 < expDiff ) {
5240
        if ( aExp == 0x7FFF ) {
5241
            if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5242
            return a;
5243
        }
5244
        if ( bExp == 0 ) {
5245
            --expDiff;
5246
        }
5247
        else {
5248
            bSig0 |= LIT64( 0x0001000000000000 );
5249
        }
5250
        shift128ExtraRightJamming(
5251
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5252
        zExp = aExp;
5253
    }
5254
    else if ( expDiff < 0 ) {
5255
        if ( bExp == 0x7FFF ) {
5256
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5257
            return packFloat128( zSign, 0x7FFF, 0, 0 );
5258
        }
5259
        if ( aExp == 0 ) {
5260
            ++expDiff;
5261
        }
5262
        else {
5263
            aSig0 |= LIT64( 0x0001000000000000 );
5264
        }
5265
        shift128ExtraRightJamming(
5266
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5267
        zExp = bExp;
5268
    }
5269
    else {
5270
        if ( aExp == 0x7FFF ) {
5271
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5272
                return propagateFloat128NaN( a, b STATUS_VAR );
5273
            }
5274
            return a;
5275
        }
5276
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5277
        if ( aExp == 0 ) {
5278
            if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
5279
            return packFloat128( zSign, 0, zSig0, zSig1 );
5280
        }
5281
        zSig2 = 0;
5282
        zSig0 |= LIT64( 0x0002000000000000 );
5283
        zExp = aExp;
5284
        goto shiftRight1;
5285
    }
5286
    aSig0 |= LIT64( 0x0001000000000000 );
5287
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5288
    --zExp;
5289
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5290
    ++zExp;
5291
 shiftRight1:
5292
    shift128ExtraRightJamming(
5293
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5294
 roundAndPack:
5295
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5296

    
5297
}
5298

    
5299
/*----------------------------------------------------------------------------
5300
| Returns the result of subtracting the absolute values of the quadruple-
5301
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
5302
| difference is negated before being returned.  `zSign' is ignored if the
5303
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
5304
| Standard for Binary Floating-Point Arithmetic.
5305
*----------------------------------------------------------------------------*/
5306

    
5307
static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5308
{
5309
    int32 aExp, bExp, zExp;
5310
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5311
    int32 expDiff;
5312
    float128 z;
5313

    
5314
    aSig1 = extractFloat128Frac1( a );
5315
    aSig0 = extractFloat128Frac0( a );
5316
    aExp = extractFloat128Exp( a );
5317
    bSig1 = extractFloat128Frac1( b );
5318
    bSig0 = extractFloat128Frac0( b );
5319
    bExp = extractFloat128Exp( b );
5320
    expDiff = aExp - bExp;
5321
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5322
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5323
    if ( 0 < expDiff ) goto aExpBigger;
5324
    if ( expDiff < 0 ) goto bExpBigger;
5325
    if ( aExp == 0x7FFF ) {
5326
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5327
            return propagateFloat128NaN( a, b STATUS_VAR );
5328
        }
5329
        float_raise( float_flag_invalid STATUS_VAR);
5330
        z.low = float128_default_nan_low;
5331
        z.high = float128_default_nan_high;
5332
        return z;
5333
    }
5334
    if ( aExp == 0 ) {
5335
        aExp = 1;
5336
        bExp = 1;
5337
    }
5338
    if ( bSig0 < aSig0 ) goto aBigger;
5339
    if ( aSig0 < bSig0 ) goto bBigger;
5340
    if ( bSig1 < aSig1 ) goto aBigger;
5341
    if ( aSig1 < bSig1 ) goto bBigger;
5342
    return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5343
 bExpBigger:
5344
    if ( bExp == 0x7FFF ) {
5345
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5346
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5347
    }
5348
    if ( aExp == 0 ) {
5349
        ++expDiff;
5350
    }
5351
    else {
5352
        aSig0 |= LIT64( 0x4000000000000000 );
5353
    }
5354
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5355
    bSig0 |= LIT64( 0x4000000000000000 );
5356
 bBigger:
5357
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5358
    zExp = bExp;
5359
    zSign ^= 1;
5360
    goto normalizeRoundAndPack;
5361
 aExpBigger:
5362
    if ( aExp == 0x7FFF ) {
5363
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5364
        return a;
5365
    }
5366
    if ( bExp == 0 ) {
5367
        --expDiff;
5368
    }
5369
    else {
5370
        bSig0 |= LIT64( 0x4000000000000000 );
5371
    }
5372
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5373
    aSig0 |= LIT64( 0x4000000000000000 );
5374
 aBigger:
5375
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5376
    zExp = aExp;
5377
 normalizeRoundAndPack:
5378
    --zExp;
5379
    return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5380

    
5381
}
5382

    
5383
/*----------------------------------------------------------------------------
5384
| Returns the result of adding the quadruple-precision floating-point values
5385
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
5386
| for Binary Floating-Point Arithmetic.
5387
*----------------------------------------------------------------------------*/
5388

    
5389
float128 float128_add( float128 a, float128 b STATUS_PARAM )
5390
{
5391
    flag aSign, bSign;
5392

    
5393
    aSign = extractFloat128Sign( a );
5394
    bSign = extractFloat128Sign( b );
5395
    if ( aSign == bSign ) {
5396
        return addFloat128Sigs( a, b, aSign STATUS_VAR );
5397
    }
5398
    else {
5399
        return subFloat128Sigs( a, b, aSign STATUS_VAR );
5400
    }
5401

    
5402
}
5403

    
5404
/*----------------------------------------------------------------------------
5405
| Returns the result of subtracting the quadruple-precision floating-point
5406
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
5407
| Standard for Binary Floating-Point Arithmetic.
5408
*----------------------------------------------------------------------------*/
5409

    
5410
float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5411
{
5412
    flag aSign, bSign;
5413

    
5414
    aSign = extractFloat128Sign( a );
5415
    bSign = extractFloat128Sign( b );
5416
    if ( aSign == bSign ) {
5417
        return subFloat128Sigs( a, b, aSign STATUS_VAR );
5418
    }
5419
    else {
5420
        return addFloat128Sigs( a, b, aSign STATUS_VAR );
5421
    }
5422

    
5423
}
5424

    
5425
/*----------------------------------------------------------------------------
5426
| Returns the result of multiplying the quadruple-precision floating-point
5427
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
5428
| Standard for Binary Floating-Point Arithmetic.
5429
*----------------------------------------------------------------------------*/
5430

    
5431
float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5432
{
5433
    flag aSign, bSign, zSign;
5434
    int32 aExp, bExp, zExp;
5435
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5436
    float128 z;
5437

    
5438
    aSig1 = extractFloat128Frac1( a );
5439
    aSig0 = extractFloat128Frac0( a );
5440
    aExp = extractFloat128Exp( a );
5441
    aSign = extractFloat128Sign( a );
5442
    bSig1 = extractFloat128Frac1( b );
5443
    bSig0 = extractFloat128Frac0( b );
5444
    bExp = extractFloat128Exp( b );
5445
    bSign = extractFloat128Sign( b );
5446
    zSign = aSign ^ bSign;
5447
    if ( aExp == 0x7FFF ) {
5448
        if (    ( aSig0 | aSig1 )
5449
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5450
            return propagateFloat128NaN( a, b STATUS_VAR );
5451
        }
5452
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5453
        return packFloat128( zSign, 0x7FFF, 0, 0 );
5454
    }
5455
    if ( bExp == 0x7FFF ) {
5456
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5457
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5458
 invalid:
5459
            float_raise( float_flag_invalid STATUS_VAR);
5460
            z.low = float128_default_nan_low;
5461
            z.high = float128_default_nan_high;
5462
            return z;
5463
        }
5464
        return packFloat128( zSign, 0x7FFF, 0, 0 );
5465
    }
5466
    if ( aExp == 0 ) {
5467
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5468
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5469
    }
5470
    if ( bExp == 0 ) {
5471
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5472
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5473
    }
5474
    zExp = aExp + bExp - 0x4000;
5475
    aSig0 |= LIT64( 0x0001000000000000 );
5476
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5477
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5478
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5479
    zSig2 |= ( zSig3 != 0 );
5480
    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5481
        shift128ExtraRightJamming(
5482
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5483
        ++zExp;
5484
    }
5485
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5486

    
5487
}
5488

    
5489
/*----------------------------------------------------------------------------
5490
| Returns the result of dividing the quadruple-precision floating-point value
5491
| `a' by the corresponding value `b'.  The operation is performed according to
5492
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5493
*----------------------------------------------------------------------------*/
5494

    
5495
float128 float128_div( float128 a, float128 b STATUS_PARAM )
5496
{
5497
    flag aSign, bSign, zSign;
5498
    int32 aExp, bExp, zExp;
5499
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5500
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5501
    float128 z;
5502

    
5503
    aSig1 = extractFloat128Frac1( a );
5504
    aSig0 = extractFloat128Frac0( a );
5505
    aExp = extractFloat128Exp( a );
5506
    aSign = extractFloat128Sign( a );
5507
    bSig1 = extractFloat128Frac1( b );
5508
    bSig0 = extractFloat128Frac0( b );
5509
    bExp = extractFloat128Exp( b );
5510
    bSign = extractFloat128Sign( b );
5511
    zSign = aSign ^ bSign;
5512
    if ( aExp == 0x7FFF ) {
5513
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5514
        if ( bExp == 0x7FFF ) {
5515
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5516
            goto invalid;
5517
        }
5518
        return packFloat128( zSign, 0x7FFF, 0, 0 );
5519
    }
5520
    if ( bExp == 0x7FFF ) {
5521
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5522
        return packFloat128( zSign, 0, 0, 0 );
5523
    }
5524
    if ( bExp == 0 ) {
5525
        if ( ( bSig0 | bSig1 ) == 0 ) {
5526
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5527
 invalid:
5528
                float_raise( float_flag_invalid STATUS_VAR);
5529
                z.low = float128_default_nan_low;
5530
                z.high = float128_default_nan_high;
5531
                return z;
5532
            }
5533
            float_raise( float_flag_divbyzero STATUS_VAR);
5534
            return packFloat128( zSign, 0x7FFF, 0, 0 );
5535
        }
5536
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5537
    }
5538
    if ( aExp == 0 ) {
5539
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5540
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5541
    }
5542
    zExp = aExp - bExp + 0x3FFD;
5543
    shortShift128Left(
5544
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5545
    shortShift128Left(
5546
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5547
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5548
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5549
        ++zExp;
5550
    }
5551
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5552
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5553
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5554
    while ( (int64_t) rem0 < 0 ) {
5555
        --zSig0;
5556
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5557
    }
5558
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5559
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5560
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5561
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5562
        while ( (int64_t) rem1 < 0 ) {
5563
            --zSig1;
5564
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5565
        }
5566
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5567
    }
5568
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5569
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5570

    
5571
}
5572

    
5573
/*----------------------------------------------------------------------------
5574
| Returns the remainder of the quadruple-precision floating-point value `a'
5575
| with respect to the corresponding value `b'.  The operation is performed
5576
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5577
*----------------------------------------------------------------------------*/
5578

    
5579
float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5580
{
5581
    flag aSign, zSign;
5582
    int32 aExp, bExp, expDiff;
5583
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5584
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
5585
    int64_t sigMean0;
5586
    float128 z;
5587

    
5588
    aSig1 = extractFloat128Frac1( a );
5589
    aSig0 = extractFloat128Frac0( a );
5590
    aExp = extractFloat128Exp( a );
5591
    aSign = extractFloat128Sign( a );
5592
    bSig1 = extractFloat128Frac1( b );
5593
    bSig0 = extractFloat128Frac0( b );
5594
    bExp = extractFloat128Exp( b );
5595
    if ( aExp == 0x7FFF ) {
5596
        if (    ( aSig0 | aSig1 )
5597
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5598
            return propagateFloat128NaN( a, b STATUS_VAR );
5599
        }
5600
        goto invalid;
5601
    }
5602
    if ( bExp == 0x7FFF ) {
5603
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5604
        return a;
5605
    }
5606
    if ( bExp == 0 ) {
5607
        if ( ( bSig0 | bSig1 ) == 0 ) {
5608
 invalid:
5609
            float_raise( float_flag_invalid STATUS_VAR);
5610
            z.low = float128_default_nan_low;
5611
            z.high = float128_default_nan_high;
5612
            return z;
5613
        }
5614
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5615
    }
5616
    if ( aExp == 0 ) {
5617
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
5618
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5619
    }
5620
    expDiff = aExp - bExp;
5621
    if ( expDiff < -1 ) return a;
5622
    shortShift128Left(
5623
        aSig0 | LIT64( 0x0001000000000000 ),
5624
        aSig1,
5625
        15 - ( expDiff < 0 ),
5626
        &aSig0,
5627
        &aSig1
5628
    );
5629
    shortShift128Left(
5630
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5631
    q = le128( bSig0, bSig1, aSig0, aSig1 );
5632
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5633
    expDiff -= 64;
5634
    while ( 0 < expDiff ) {
5635
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5636
        q = ( 4 < q ) ? q - 4 : 0;
5637
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5638
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5639
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5640
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5641
        expDiff -= 61;
5642
    }
5643
    if ( -64 < expDiff ) {
5644
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5645
        q = ( 4 < q ) ? q - 4 : 0;
5646
        q >>= - expDiff;
5647
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5648
        expDiff += 52;
5649
        if ( expDiff < 0 ) {
5650
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5651
        }
5652
        else {
5653
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5654
        }
5655
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5656
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5657
    }
5658
    else {
5659
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5660
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5661
    }
5662
    do {
5663
        alternateASig0 = aSig0;
5664
        alternateASig1 = aSig1;
5665
        ++q;
5666
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5667
    } while ( 0 <= (int64_t) aSig0 );
5668
    add128(
5669
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
5670
    if (    ( sigMean0 < 0 )
5671
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5672
        aSig0 = alternateASig0;
5673
        aSig1 = alternateASig1;
5674
    }
5675
    zSign = ( (int64_t) aSig0 < 0 );
5676
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5677
    return
5678
        normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5679

    
5680
}
5681

    
5682
/*----------------------------------------------------------------------------
5683
| Returns the square root of the quadruple-precision floating-point value `a'.
5684
| The operation is performed according to the IEC/IEEE Standard for Binary
5685
| Floating-Point Arithmetic.
5686
*----------------------------------------------------------------------------*/
5687

    
5688
float128 float128_sqrt( float128 a STATUS_PARAM )
5689
{
5690
    flag aSign;
5691
    int32 aExp, zExp;
5692
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5693
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5694
    float128 z;
5695

    
5696
    aSig1 = extractFloat128Frac1( a );
5697
    aSig0 = extractFloat128Frac0( a );
5698
    aExp = extractFloat128Exp( a );
5699
    aSign = extractFloat128Sign( a );
5700
    if ( aExp == 0x7FFF ) {
5701
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5702
        if ( ! aSign ) return a;
5703
        goto invalid;
5704
    }
5705
    if ( aSign ) {
5706
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5707
 invalid:
5708
        float_raise( float_flag_invalid STATUS_VAR);
5709
        z.low = float128_default_nan_low;
5710
        z.high = float128_default_nan_high;
5711
        return z;
5712
    }
5713
    if ( aExp == 0 ) {
5714
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5715
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5716
    }
5717
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5718
    aSig0 |= LIT64( 0x0001000000000000 );
5719
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5720
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5721
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5722
    doubleZSig0 = zSig0<<1;
5723
    mul64To128( zSig0, zSig0, &term0, &term1 );
5724
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5725
    while ( (int64_t) rem0 < 0 ) {
5726
        --zSig0;
5727
        doubleZSig0 -= 2;
5728
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5729
    }
5730
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5731
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5732
        if ( zSig1 == 0 ) zSig1 = 1;
5733
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5734
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5735
        mul64To128( zSig1, zSig1, &term2, &term3 );
5736
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5737
        while ( (int64_t) rem1 < 0 ) {
5738
            --zSig1;
5739
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5740
            term3 |= 1;
5741
            term2 |= doubleZSig0;
5742
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5743
        }
5744
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5745
    }
5746
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5747
    return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5748

    
5749
}
5750

    
5751
/*----------------------------------------------------------------------------
5752
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5753
| the corresponding value `b', and 0 otherwise.  The comparison is performed
5754
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5755
*----------------------------------------------------------------------------*/
5756

    
5757
int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
5758
{
5759

    
5760
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5761
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5762
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5763
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5764
       ) {
5765
        if (    float128_is_signaling_nan( a )
5766
             || float128_is_signaling_nan( b ) ) {
5767
            float_raise( float_flag_invalid STATUS_VAR);
5768
        }
5769
        return 0;
5770
    }
5771
    return
5772
           ( a.low == b.low )
5773
        && (    ( a.high == b.high )
5774
             || (    ( a.low == 0 )
5775
                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5776
           );
5777

    
5778
}
5779

    
5780
/*----------------------------------------------------------------------------
5781
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5782
| or equal to the corresponding value `b', and 0 otherwise.  The comparison
5783
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5784
| Arithmetic.
5785
*----------------------------------------------------------------------------*/
5786

    
5787
int float128_le( float128 a, float128 b STATUS_PARAM )
5788
{
5789
    flag aSign, bSign;
5790

    
5791
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5792
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5793
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5794
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5795
       ) {
5796
        float_raise( float_flag_invalid STATUS_VAR);
5797
        return 0;
5798
    }
5799
    aSign = extractFloat128Sign( a );
5800
    bSign = extractFloat128Sign( b );
5801
    if ( aSign != bSign ) {
5802
        return
5803
               aSign
5804
            || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5805
                 == 0 );
5806
    }
5807
    return
5808
          aSign ? le128( b.high, b.low, a.high, a.low )
5809
        : le128( a.high, a.low, b.high, b.low );
5810

    
5811
}
5812

    
5813
/*----------------------------------------------------------------------------
5814
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5815
| the corresponding value `b', and 0 otherwise.  The comparison is performed
5816
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5817
*----------------------------------------------------------------------------*/
5818

    
5819
int float128_lt( float128 a, float128 b STATUS_PARAM )
5820
{
5821
    flag aSign, bSign;
5822

    
5823
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5824
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5825
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5826
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5827
       ) {
5828
        float_raise( float_flag_invalid STATUS_VAR);
5829
        return 0;
5830
    }
5831
    aSign = extractFloat128Sign( a );
5832
    bSign = extractFloat128Sign( b );
5833
    if ( aSign != bSign ) {
5834
        return
5835
               aSign
5836
            && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5837
                 != 0 );
5838
    }
5839
    return
5840
          aSign ? lt128( b.high, b.low, a.high, a.low )
5841
        : lt128( a.high, a.low, b.high, b.low );
5842

    
5843
}
5844

    
5845
/*----------------------------------------------------------------------------
5846
| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
5847
| be compared, and 0 otherwise.  The comparison is performed according to the
5848
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5849
*----------------------------------------------------------------------------*/
5850

    
5851
int float128_unordered( float128 a, float128 b STATUS_PARAM )
5852
{
5853
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5854
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5855
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5856
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5857
       ) {
5858
        float_raise( float_flag_invalid STATUS_VAR);
5859
        return 1;
5860
    }
5861
    return 0;
5862
}
5863

    
5864
/*----------------------------------------------------------------------------
5865
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5866
| the corresponding value `b', and 0 otherwise.  The invalid exception is
5867
| raised if either operand is a NaN.  Otherwise, the comparison is performed
5868
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5869
*----------------------------------------------------------------------------*/
5870

    
5871
int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5872
{
5873

    
5874
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5875
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5876
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5877
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5878
       ) {
5879
        float_raise( float_flag_invalid STATUS_VAR);
5880
        return 0;
5881
    }
5882
    return
5883
           ( a.low == b.low )
5884
        && (    ( a.high == b.high )
5885
             || (    ( a.low == 0 )
5886
                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5887
           );
5888

    
5889
}
5890

    
5891
/*----------------------------------------------------------------------------
5892
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5893
| or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5894
| cause an exception.  Otherwise, the comparison is performed according to the
5895
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5896
*----------------------------------------------------------------------------*/
5897

    
5898
int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5899
{
5900
    flag aSign, bSign;
5901

    
5902
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5903
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5904
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5905
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5906
       ) {
5907
        if (    float128_is_signaling_nan( a )
5908
             || float128_is_signaling_nan( b ) ) {
5909
            float_raise( float_flag_invalid STATUS_VAR);
5910
        }
5911
        return 0;
5912
    }
5913
    aSign = extractFloat128Sign( a );
5914
    bSign = extractFloat128Sign( b );
5915
    if ( aSign != bSign ) {
5916
        return
5917
               aSign
5918
            || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5919
                 == 0 );
5920
    }
5921
    return
5922
          aSign ? le128( b.high, b.low, a.high, a.low )
5923
        : le128( a.high, a.low, b.high, b.low );
5924

    
5925
}
5926

    
5927
/*----------------------------------------------------------------------------
5928
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5929
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5930
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5931
| Standard for Binary Floating-Point Arithmetic.
5932
*----------------------------------------------------------------------------*/
5933

    
5934
int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5935
{
5936
    flag aSign, bSign;
5937

    
5938
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5939
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5940
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5941
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5942
       ) {
5943
        if (    float128_is_signaling_nan( a )
5944
             || float128_is_signaling_nan( b ) ) {
5945
            float_raise( float_flag_invalid STATUS_VAR);
5946
        }
5947
        return 0;
5948
    }
5949
    aSign = extractFloat128Sign( a );
5950
    bSign = extractFloat128Sign( b );
5951
    if ( aSign != bSign ) {
5952
        return
5953
               aSign
5954
            && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5955
                 != 0 );
5956
    }
5957
    return
5958
          aSign ? lt128( b.high, b.low, a.high, a.low )
5959
        : lt128( a.high, a.low, b.high, b.low );
5960

    
5961
}
5962

    
5963
/*----------------------------------------------------------------------------
5964
| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
5965
| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
5966
| comparison is performed according to the IEC/IEEE Standard for Binary
5967
| Floating-Point Arithmetic.
5968
*----------------------------------------------------------------------------*/
5969

    
5970
int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
5971
{
5972
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5973
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5974
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5975
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5976
       ) {
5977
        if (    float128_is_signaling_nan( a )
5978
             || float128_is_signaling_nan( b ) ) {
5979
            float_raise( float_flag_invalid STATUS_VAR);
5980
        }
5981
        return 1;
5982
    }
5983
    return 0;
5984
}
5985

    
5986
#endif
5987

    
5988
/* misc functions */
5989
float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5990
{
5991
    return int64_to_float32(a STATUS_VAR);
5992
}
5993

    
5994
float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5995
{
5996
    return int64_to_float64(a STATUS_VAR);
5997
}
5998

    
5999
unsigned int float32_to_uint32( float32 a STATUS_PARAM )
6000
{
6001
    int64_t v;
6002
    unsigned int res;
6003

    
6004
    v = float32_to_int64(a STATUS_VAR);
6005
    if (v < 0) {
6006
        res = 0;
6007
        float_raise( float_flag_invalid STATUS_VAR);
6008
    } else if (v > 0xffffffff) {
6009
        res = 0xffffffff;
6010
        float_raise( float_flag_invalid STATUS_VAR);
6011
    } else {
6012
        res = v;
6013
    }
6014
    return res;
6015
}
6016

    
6017
unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
6018
{
6019
    int64_t v;
6020
    unsigned int res;
6021

    
6022
    v = float32_to_int64_round_to_zero(a STATUS_VAR);
6023
    if (v < 0) {
6024
        res = 0;
6025
        float_raise( float_flag_invalid STATUS_VAR);
6026
    } else if (v > 0xffffffff) {
6027
        res = 0xffffffff;
6028
        float_raise( float_flag_invalid STATUS_VAR);
6029
    } else {
6030
        res = v;
6031
    }
6032
    return res;
6033
}
6034

    
6035
unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
6036
{
6037
    int64_t v;
6038
    unsigned int res;
6039

    
6040
    v = float32_to_int64_round_to_zero(a STATUS_VAR);
6041
    if (v < 0) {
6042
        res = 0;
6043
        float_raise( float_flag_invalid STATUS_VAR);
6044
    } else if (v > 0xffff) {
6045
        res = 0xffff;
6046
        float_raise( float_flag_invalid STATUS_VAR);
6047
    } else {
6048
        res = v;
6049
    }
6050
    return res;
6051
}
6052

    
6053
unsigned int float64_to_uint32( float64 a STATUS_PARAM )
6054
{
6055
    int64_t v;
6056
    unsigned int res;
6057

    
6058
    v = float64_to_int64(a STATUS_VAR);
6059
    if (v < 0) {
6060
        res = 0;
6061
        float_raise( float_flag_invalid STATUS_VAR);
6062
    } else if (v > 0xffffffff) {
6063
        res = 0xffffffff;
6064
        float_raise( float_flag_invalid STATUS_VAR);
6065
    } else {
6066
        res = v;
6067
    }
6068
    return res;
6069
}
6070

    
6071
unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
6072
{
6073
    int64_t v;
6074
    unsigned int res;
6075

    
6076
    v = float64_to_int64_round_to_zero(a STATUS_VAR);
6077
    if (v < 0) {
6078
        res = 0;
6079
        float_raise( float_flag_invalid STATUS_VAR);
6080
    } else if (v > 0xffffffff) {
6081
        res = 0xffffffff;
6082
        float_raise( float_flag_invalid STATUS_VAR);
6083
    } else {
6084
        res = v;
6085
    }
6086
    return res;
6087
}
6088

    
6089
unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
6090
{
6091
    int64_t v;
6092
    unsigned int res;
6093

    
6094
    v = float64_to_int64_round_to_zero(a STATUS_VAR);
6095
    if (v < 0) {
6096
        res = 0;
6097
        float_raise( float_flag_invalid STATUS_VAR);
6098
    } else if (v > 0xffff) {
6099
        res = 0xffff;
6100
        float_raise( float_flag_invalid STATUS_VAR);
6101
    } else {
6102
        res = v;
6103
    }
6104
    return res;
6105
}
6106

    
6107
/* FIXME: This looks broken.  */
6108
uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
6109
{
6110
    int64_t v;
6111

    
6112
    v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6113
    v += float64_val(a);
6114
    v = float64_to_int64(make_float64(v) STATUS_VAR);
6115

    
6116
    return v - INT64_MIN;
6117
}
6118

    
6119
uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6120
{
6121
    int64_t v;
6122

    
6123
    v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6124
    v += float64_val(a);
6125
    v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
6126

    
6127
    return v - INT64_MIN;
6128
}
6129

    
6130
#define COMPARE(s, nan_exp)                                                  \
6131
INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
6132
                                      int is_quiet STATUS_PARAM )            \
6133
{                                                                            \
6134
    flag aSign, bSign;                                                       \
6135
    uint ## s ## _t av, bv;                                                  \
6136
    a = float ## s ## _squash_input_denormal(a STATUS_VAR);                  \
6137
    b = float ## s ## _squash_input_denormal(b STATUS_VAR);                  \
6138
                                                                             \
6139
    if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
6140
         extractFloat ## s ## Frac( a ) ) ||                                 \
6141
        ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
6142
          extractFloat ## s ## Frac( b ) )) {                                \
6143
        if (!is_quiet ||                                                     \
6144
            float ## s ## _is_signaling_nan( a ) ||                          \
6145
            float ## s ## _is_signaling_nan( b ) ) {                         \
6146
            float_raise( float_flag_invalid STATUS_VAR);                     \
6147
        }                                                                    \
6148
        return float_relation_unordered;                                     \
6149
    }                                                                        \
6150
    aSign = extractFloat ## s ## Sign( a );                                  \
6151
    bSign = extractFloat ## s ## Sign( b );                                  \
6152
    av = float ## s ## _val(a);                                              \
6153
    bv = float ## s ## _val(b);                                              \
6154
    if ( aSign != bSign ) {                                                  \
6155
        if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
6156
            /* zero case */                                                  \
6157
            return float_relation_equal;                                     \
6158
        } else {                                                             \
6159
            return 1 - (2 * aSign);                                          \
6160
        }                                                                    \
6161
    } else {                                                                 \
6162
        if (av == bv) {                                                      \
6163
            return float_relation_equal;                                     \
6164
        } else {                                                             \
6165
            return 1 - 2 * (aSign ^ ( av < bv ));                            \
6166
        }                                                                    \
6167
    }                                                                        \
6168
}                                                                            \
6169
                                                                             \
6170
int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
6171
{                                                                            \
6172
    return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
6173
}                                                                            \
6174
                                                                             \
6175
int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
6176
{                                                                            \
6177
    return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
6178
}
6179

    
6180
COMPARE(32, 0xff)
6181
COMPARE(64, 0x7ff)
6182

    
6183
INLINE int float128_compare_internal( float128 a, float128 b,
6184
                                      int is_quiet STATUS_PARAM )
6185
{
6186
    flag aSign, bSign;
6187

    
6188
    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6189
          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6190
        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6191
          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6192
        if (!is_quiet ||
6193
            float128_is_signaling_nan( a ) ||
6194
            float128_is_signaling_nan( b ) ) {
6195
            float_raise( float_flag_invalid STATUS_VAR);
6196
        }
6197
        return float_relation_unordered;
6198
    }
6199
    aSign = extractFloat128Sign( a );
6200
    bSign = extractFloat128Sign( b );
6201
    if ( aSign != bSign ) {
6202
        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6203
            /* zero case */
6204
            return float_relation_equal;
6205
        } else {
6206
            return 1 - (2 * aSign);
6207
        }
6208
    } else {
6209
        if (a.low == b.low && a.high == b.high) {
6210
            return float_relation_equal;
6211
        } else {
6212
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6213
        }
6214
    }
6215
}
6216

    
6217
int float128_compare( float128 a, float128 b STATUS_PARAM )
6218
{
6219
    return float128_compare_internal(a, b, 0 STATUS_VAR);
6220
}
6221

    
6222
int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6223
{
6224
    return float128_compare_internal(a, b, 1 STATUS_VAR);
6225
}
6226

    
6227
/* min() and max() functions. These can't be implemented as
6228
 * 'compare and pick one input' because that would mishandle
6229
 * NaNs and +0 vs -0.
6230
 */
6231
#define MINMAX(s, nan_exp)                                              \
6232
INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
6233
                                        int ismin STATUS_PARAM )        \
6234
{                                                                       \
6235
    flag aSign, bSign;                                                  \
6236
    uint ## s ## _t av, bv;                                             \
6237
    a = float ## s ## _squash_input_denormal(a STATUS_VAR);             \
6238
    b = float ## s ## _squash_input_denormal(b STATUS_VAR);             \
6239
    if (float ## s ## _is_any_nan(a) ||                                 \
6240
        float ## s ## _is_any_nan(b)) {                                 \
6241
        return propagateFloat ## s ## NaN(a, b STATUS_VAR);             \
6242
    }                                                                   \
6243
    aSign = extractFloat ## s ## Sign(a);                               \
6244
    bSign = extractFloat ## s ## Sign(b);                               \
6245
    av = float ## s ## _val(a);                                         \
6246
    bv = float ## s ## _val(b);                                         \
6247
    if (aSign != bSign) {                                               \
6248
        if (ismin) {                                                    \
6249
            return aSign ? a : b;                                       \
6250
        } else {                                                        \
6251
            return aSign ? b : a;                                       \
6252
        }                                                               \
6253
    } else {                                                            \
6254
        if (ismin) {                                                    \
6255
            return (aSign ^ (av < bv)) ? a : b;                         \
6256
        } else {                                                        \
6257
            return (aSign ^ (av < bv)) ? b : a;                         \
6258
        }                                                               \
6259
    }                                                                   \
6260
}                                                                       \
6261
                                                                        \
6262
float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM)  \
6263
{                                                                       \
6264
    return float ## s ## _minmax(a, b, 1 STATUS_VAR);                   \
6265
}                                                                       \
6266
                                                                        \
6267
float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM)  \
6268
{                                                                       \
6269
    return float ## s ## _minmax(a, b, 0 STATUS_VAR);                   \
6270
}
6271

    
6272
MINMAX(32, 0xff)
6273
MINMAX(64, 0x7ff)
6274

    
6275

    
6276
/* Multiply A by 2 raised to the power N.  */
6277
float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6278
{
6279
    flag aSign;
6280
    int16 aExp;
6281
    uint32_t aSig;
6282

    
6283
    a = float32_squash_input_denormal(a STATUS_VAR);
6284
    aSig = extractFloat32Frac( a );
6285
    aExp = extractFloat32Exp( a );
6286
    aSign = extractFloat32Sign( a );
6287

    
6288
    if ( aExp == 0xFF ) {
6289
        return a;
6290
    }
6291
    if ( aExp != 0 )
6292
        aSig |= 0x00800000;
6293
    else if ( aSig == 0 )
6294
        return a;
6295

    
6296
    aExp += n - 1;
6297
    aSig <<= 7;
6298
    return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6299
}
6300

    
6301
float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6302
{
6303
    flag aSign;
6304
    int16 aExp;
6305
    uint64_t aSig;
6306

    
6307
    a = float64_squash_input_denormal(a STATUS_VAR);
6308
    aSig = extractFloat64Frac( a );
6309
    aExp = extractFloat64Exp( a );
6310
    aSign = extractFloat64Sign( a );
6311

    
6312
    if ( aExp == 0x7FF ) {
6313
        return a;
6314
    }
6315
    if ( aExp != 0 )
6316
        aSig |= LIT64( 0x0010000000000000 );
6317
    else if ( aSig == 0 )
6318
        return a;
6319

    
6320
    aExp += n - 1;
6321
    aSig <<= 10;
6322
    return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6323
}
6324

    
6325
#ifdef FLOATX80
6326
floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6327
{
6328
    flag aSign;
6329
    int16 aExp;
6330
    uint64_t aSig;
6331

    
6332
    aSig = extractFloatx80Frac( a );
6333
    aExp = extractFloatx80Exp( a );
6334
    aSign = extractFloatx80Sign( a );
6335

    
6336
    if ( aExp == 0x7FF ) {
6337
        return a;
6338
    }
6339
    if (aExp == 0 && aSig == 0)
6340
        return a;
6341

    
6342
    aExp += n;
6343
    return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6344
                                          aSign, aExp, aSig, 0 STATUS_VAR );
6345
}
6346
#endif
6347

    
6348
#ifdef FLOAT128
6349
float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6350
{
6351
    flag aSign;
6352
    int32 aExp;
6353
    uint64_t aSig0, aSig1;
6354

    
6355
    aSig1 = extractFloat128Frac1( a );
6356
    aSig0 = extractFloat128Frac0( a );
6357
    aExp = extractFloat128Exp( a );
6358
    aSign = extractFloat128Sign( a );
6359
    if ( aExp == 0x7FFF ) {
6360
        return a;
6361
    }
6362
    if ( aExp != 0 )
6363
        aSig0 |= LIT64( 0x0001000000000000 );
6364
    else if ( aSig0 == 0 && aSig1 == 0 )
6365
        return a;
6366

    
6367
    aExp += n - 1;
6368
    return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6369
                                          STATUS_VAR );
6370

    
6371
}
6372
#endif