Statistics
| Branch: | Revision:

root / fpu / softfloat.c @ 6df658f5

History | View | Annotate | Download (228.5 kB)

1
/*
2
 * QEMU float support
3
 *
4
 * Derived from SoftFloat.
5
 */
6

    
7
/*============================================================================
8

9
This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
10
Package, Release 2b.
11

12
Written by John R. Hauser.  This work was made possible in part by the
13
International Computer Science Institute, located at Suite 600, 1947 Center
14
Street, Berkeley, California 94704.  Funding was partially provided by the
15
National Science Foundation under grant MIP-9311980.  The original version
16
of this code was written as part of a project to build a fixed-point vector
17
processor in collaboration with the University of California at Berkeley,
18
overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
19
is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20
arithmetic/SoftFloat.html'.
21

22
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
23
been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24
RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25
AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26
COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27
EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28
INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29
OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
30

31
Derivative works are acceptable, even for commercial purposes, so long as
32
(1) the source code for the derivative work includes prominent notice that
33
the work is derivative, and (2) the source code includes prominent notice with
34
these four paragraphs for those parts of this code that are retained.
35

36
=============================================================================*/
37

    
38
#include "softfloat.h"
39

    
40
/*----------------------------------------------------------------------------
41
| Primitive arithmetic functions, including multi-word arithmetic, and
42
| division and square root approximations.  (Can be specialized to target if
43
| desired.)
44
*----------------------------------------------------------------------------*/
45
#include "softfloat-macros.h"
46

    
47
/*----------------------------------------------------------------------------
48
| Functions and definitions to determine:  (1) whether tininess for underflow
49
| is detected before or after rounding by default, (2) what (if anything)
50
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
51
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
52
| are propagated from function inputs to output.  These details are target-
53
| specific.
54
*----------------------------------------------------------------------------*/
55
#include "softfloat-specialize.h"
56

    
57
void set_float_rounding_mode(int val STATUS_PARAM)
58
{
59
    STATUS(float_rounding_mode) = val;
60
}
61

    
62
void set_float_exception_flags(int val STATUS_PARAM)
63
{
64
    STATUS(float_exception_flags) = val;
65
}
66

    
67
void set_floatx80_rounding_precision(int val STATUS_PARAM)
68
{
69
    STATUS(floatx80_rounding_precision) = val;
70
}
71

    
72
/*----------------------------------------------------------------------------
73
| Returns the fraction bits of the half-precision floating-point value `a'.
74
*----------------------------------------------------------------------------*/
75

    
76
INLINE uint32_t extractFloat16Frac(float16 a)
77
{
78
    return float16_val(a) & 0x3ff;
79
}
80

    
81
/*----------------------------------------------------------------------------
82
| Returns the exponent bits of the half-precision floating-point value `a'.
83
*----------------------------------------------------------------------------*/
84

    
85
INLINE int16 extractFloat16Exp(float16 a)
86
{
87
    return (float16_val(a) >> 10) & 0x1f;
88
}
89

    
90
/*----------------------------------------------------------------------------
91
| Returns the sign bit of the single-precision floating-point value `a'.
92
*----------------------------------------------------------------------------*/
93

    
94
INLINE flag extractFloat16Sign(float16 a)
95
{
96
    return float16_val(a)>>15;
97
}
98

    
99
/*----------------------------------------------------------------------------
100
| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
101
| and 7, and returns the properly rounded 32-bit integer corresponding to the
102
| input.  If `zSign' is 1, the input is negated before being converted to an
103
| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
104
| is simply rounded to an integer, with the inexact exception raised if the
105
| input cannot be represented exactly as an integer.  However, if the fixed-
106
| point input is too large, the invalid exception is raised and the largest
107
| positive or negative integer is returned.
108
*----------------------------------------------------------------------------*/
109

    
110
static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
111
{
112
    int8 roundingMode;
113
    flag roundNearestEven;
114
    int8 roundIncrement, roundBits;
115
    int32 z;
116

    
117
    roundingMode = STATUS(float_rounding_mode);
118
    roundNearestEven = ( roundingMode == float_round_nearest_even );
119
    roundIncrement = 0x40;
120
    if ( ! roundNearestEven ) {
121
        if ( roundingMode == float_round_to_zero ) {
122
            roundIncrement = 0;
123
        }
124
        else {
125
            roundIncrement = 0x7F;
126
            if ( zSign ) {
127
                if ( roundingMode == float_round_up ) roundIncrement = 0;
128
            }
129
            else {
130
                if ( roundingMode == float_round_down ) roundIncrement = 0;
131
            }
132
        }
133
    }
134
    roundBits = absZ & 0x7F;
135
    absZ = ( absZ + roundIncrement )>>7;
136
    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
137
    z = absZ;
138
    if ( zSign ) z = - z;
139
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
140
        float_raise( float_flag_invalid STATUS_VAR);
141
        return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
142
    }
143
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
144
    return z;
145

    
146
}
147

    
148
/*----------------------------------------------------------------------------
149
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
150
| `absZ1', with binary point between bits 63 and 64 (between the input words),
151
| and returns the properly rounded 64-bit integer corresponding to the input.
152
| If `zSign' is 1, the input is negated before being converted to an integer.
153
| Ordinarily, the fixed-point input is simply rounded to an integer, with
154
| the inexact exception raised if the input cannot be represented exactly as
155
| an integer.  However, if the fixed-point input is too large, the invalid
156
| exception is raised and the largest positive or negative integer is
157
| returned.
158
*----------------------------------------------------------------------------*/
159

    
160
static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
161
{
162
    int8 roundingMode;
163
    flag roundNearestEven, increment;
164
    int64 z;
165

    
166
    roundingMode = STATUS(float_rounding_mode);
167
    roundNearestEven = ( roundingMode == float_round_nearest_even );
168
    increment = ( (int64_t) absZ1 < 0 );
169
    if ( ! roundNearestEven ) {
170
        if ( roundingMode == float_round_to_zero ) {
171
            increment = 0;
172
        }
173
        else {
174
            if ( zSign ) {
175
                increment = ( roundingMode == float_round_down ) && absZ1;
176
            }
177
            else {
178
                increment = ( roundingMode == float_round_up ) && absZ1;
179
            }
180
        }
181
    }
182
    if ( increment ) {
183
        ++absZ0;
184
        if ( absZ0 == 0 ) goto overflow;
185
        absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
186
    }
187
    z = absZ0;
188
    if ( zSign ) z = - z;
189
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
190
 overflow:
191
        float_raise( float_flag_invalid STATUS_VAR);
192
        return
193
              zSign ? (int64_t) LIT64( 0x8000000000000000 )
194
            : LIT64( 0x7FFFFFFFFFFFFFFF );
195
    }
196
    if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
197
    return z;
198

    
199
}
200

    
201
/*----------------------------------------------------------------------------
202
| Returns the fraction bits of the single-precision floating-point value `a'.
203
*----------------------------------------------------------------------------*/
204

    
205
INLINE uint32_t extractFloat32Frac( float32 a )
206
{
207

    
208
    return float32_val(a) & 0x007FFFFF;
209

    
210
}
211

    
212
/*----------------------------------------------------------------------------
213
| Returns the exponent bits of the single-precision floating-point value `a'.
214
*----------------------------------------------------------------------------*/
215

    
216
INLINE int16 extractFloat32Exp( float32 a )
217
{
218

    
219
    return ( float32_val(a)>>23 ) & 0xFF;
220

    
221
}
222

    
223
/*----------------------------------------------------------------------------
224
| Returns the sign bit of the single-precision floating-point value `a'.
225
*----------------------------------------------------------------------------*/
226

    
227
INLINE flag extractFloat32Sign( float32 a )
228
{
229

    
230
    return float32_val(a)>>31;
231

    
232
}
233

    
234
/*----------------------------------------------------------------------------
235
| If `a' is denormal and we are in flush-to-zero mode then set the
236
| input-denormal exception and return zero. Otherwise just return the value.
237
*----------------------------------------------------------------------------*/
238
static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
239
{
240
    if (STATUS(flush_inputs_to_zero)) {
241
        if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
242
            float_raise(float_flag_input_denormal STATUS_VAR);
243
            return make_float32(float32_val(a) & 0x80000000);
244
        }
245
    }
246
    return a;
247
}
248

    
249
/*----------------------------------------------------------------------------
250
| Normalizes the subnormal single-precision floating-point value represented
251
| by the denormalized significand `aSig'.  The normalized exponent and
252
| significand are stored at the locations pointed to by `zExpPtr' and
253
| `zSigPtr', respectively.
254
*----------------------------------------------------------------------------*/
255

    
256
static void
257
 normalizeFloat32Subnormal( uint32_t aSig, int16 *zExpPtr, uint32_t *zSigPtr )
258
{
259
    int8 shiftCount;
260

    
261
    shiftCount = countLeadingZeros32( aSig ) - 8;
262
    *zSigPtr = aSig<<shiftCount;
263
    *zExpPtr = 1 - shiftCount;
264

    
265
}
266

    
267
/*----------------------------------------------------------------------------
268
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
269
| single-precision floating-point value, returning the result.  After being
270
| shifted into the proper positions, the three fields are simply added
271
| together to form the result.  This means that any integer portion of `zSig'
272
| will be added into the exponent.  Since a properly normalized significand
273
| will have an integer portion equal to 1, the `zExp' input should be 1 less
274
| than the desired result exponent whenever `zSig' is a complete, normalized
275
| significand.
276
*----------------------------------------------------------------------------*/
277

    
278
INLINE float32 packFloat32( flag zSign, int16 zExp, uint32_t zSig )
279
{
280

    
281
    return make_float32(
282
          ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
283

    
284
}
285

    
286
/*----------------------------------------------------------------------------
287
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
288
| and significand `zSig', and returns the proper single-precision floating-
289
| point value corresponding to the abstract input.  Ordinarily, the abstract
290
| value is simply rounded and packed into the single-precision format, with
291
| the inexact exception raised if the abstract input cannot be represented
292
| exactly.  However, if the abstract value is too large, the overflow and
293
| inexact exceptions are raised and an infinity or maximal finite value is
294
| returned.  If the abstract value is too small, the input value is rounded to
295
| a subnormal number, and the underflow and inexact exceptions are raised if
296
| the abstract input cannot be represented exactly as a subnormal single-
297
| precision floating-point number.
298
|     The input significand `zSig' has its binary point between bits 30
299
| and 29, which is 7 bits to the left of the usual location.  This shifted
300
| significand must be normalized or smaller.  If `zSig' is not normalized,
301
| `zExp' must be 0; in that case, the result returned is a subnormal number,
302
| and it must not require rounding.  In the usual case that `zSig' is
303
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
304
| The handling of underflow and overflow follows the IEC/IEEE Standard for
305
| Binary Floating-Point Arithmetic.
306
*----------------------------------------------------------------------------*/
307

    
308
static float32 roundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
309
{
310
    int8 roundingMode;
311
    flag roundNearestEven;
312
    int8 roundIncrement, roundBits;
313
    flag isTiny;
314

    
315
    roundingMode = STATUS(float_rounding_mode);
316
    roundNearestEven = ( roundingMode == float_round_nearest_even );
317
    roundIncrement = 0x40;
318
    if ( ! roundNearestEven ) {
319
        if ( roundingMode == float_round_to_zero ) {
320
            roundIncrement = 0;
321
        }
322
        else {
323
            roundIncrement = 0x7F;
324
            if ( zSign ) {
325
                if ( roundingMode == float_round_up ) roundIncrement = 0;
326
            }
327
            else {
328
                if ( roundingMode == float_round_down ) roundIncrement = 0;
329
            }
330
        }
331
    }
332
    roundBits = zSig & 0x7F;
333
    if ( 0xFD <= (uint16_t) zExp ) {
334
        if (    ( 0xFD < zExp )
335
             || (    ( zExp == 0xFD )
336
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
337
           ) {
338
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
339
            return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
340
        }
341
        if ( zExp < 0 ) {
342
            if (STATUS(flush_to_zero)) {
343
                float_raise(float_flag_output_denormal STATUS_VAR);
344
                return packFloat32(zSign, 0, 0);
345
            }
346
            isTiny =
347
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
348
                || ( zExp < -1 )
349
                || ( zSig + roundIncrement < 0x80000000 );
350
            shift32RightJamming( zSig, - zExp, &zSig );
351
            zExp = 0;
352
            roundBits = zSig & 0x7F;
353
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
354
        }
355
    }
356
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
357
    zSig = ( zSig + roundIncrement )>>7;
358
    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
359
    if ( zSig == 0 ) zExp = 0;
360
    return packFloat32( zSign, zExp, zSig );
361

    
362
}
363

    
364
/*----------------------------------------------------------------------------
365
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
366
| and significand `zSig', and returns the proper single-precision floating-
367
| point value corresponding to the abstract input.  This routine is just like
368
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
369
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
370
| floating-point exponent.
371
*----------------------------------------------------------------------------*/
372

    
373
static float32
374
 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
375
{
376
    int8 shiftCount;
377

    
378
    shiftCount = countLeadingZeros32( zSig ) - 1;
379
    return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
380

    
381
}
382

    
383
/*----------------------------------------------------------------------------
384
| Returns the fraction bits of the double-precision floating-point value `a'.
385
*----------------------------------------------------------------------------*/
386

    
387
INLINE uint64_t extractFloat64Frac( float64 a )
388
{
389

    
390
    return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
391

    
392
}
393

    
394
/*----------------------------------------------------------------------------
395
| Returns the exponent bits of the double-precision floating-point value `a'.
396
*----------------------------------------------------------------------------*/
397

    
398
INLINE int16 extractFloat64Exp( float64 a )
399
{
400

    
401
    return ( float64_val(a)>>52 ) & 0x7FF;
402

    
403
}
404

    
405
/*----------------------------------------------------------------------------
406
| Returns the sign bit of the double-precision floating-point value `a'.
407
*----------------------------------------------------------------------------*/
408

    
409
INLINE flag extractFloat64Sign( float64 a )
410
{
411

    
412
    return float64_val(a)>>63;
413

    
414
}
415

    
416
/*----------------------------------------------------------------------------
417
| If `a' is denormal and we are in flush-to-zero mode then set the
418
| input-denormal exception and return zero. Otherwise just return the value.
419
*----------------------------------------------------------------------------*/
420
static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
421
{
422
    if (STATUS(flush_inputs_to_zero)) {
423
        if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
424
            float_raise(float_flag_input_denormal STATUS_VAR);
425
            return make_float64(float64_val(a) & (1ULL << 63));
426
        }
427
    }
428
    return a;
429
}
430

    
431
/*----------------------------------------------------------------------------
432
| Normalizes the subnormal double-precision floating-point value represented
433
| by the denormalized significand `aSig'.  The normalized exponent and
434
| significand are stored at the locations pointed to by `zExpPtr' and
435
| `zSigPtr', respectively.
436
*----------------------------------------------------------------------------*/
437

    
438
static void
439
 normalizeFloat64Subnormal( uint64_t aSig, int16 *zExpPtr, uint64_t *zSigPtr )
440
{
441
    int8 shiftCount;
442

    
443
    shiftCount = countLeadingZeros64( aSig ) - 11;
444
    *zSigPtr = aSig<<shiftCount;
445
    *zExpPtr = 1 - shiftCount;
446

    
447
}
448

    
449
/*----------------------------------------------------------------------------
450
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
451
| double-precision floating-point value, returning the result.  After being
452
| shifted into the proper positions, the three fields are simply added
453
| together to form the result.  This means that any integer portion of `zSig'
454
| will be added into the exponent.  Since a properly normalized significand
455
| will have an integer portion equal to 1, the `zExp' input should be 1 less
456
| than the desired result exponent whenever `zSig' is a complete, normalized
457
| significand.
458
*----------------------------------------------------------------------------*/
459

    
460
INLINE float64 packFloat64( flag zSign, int16 zExp, uint64_t zSig )
461
{
462

    
463
    return make_float64(
464
        ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
465

    
466
}
467

    
468
/*----------------------------------------------------------------------------
469
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
470
| and significand `zSig', and returns the proper double-precision floating-
471
| point value corresponding to the abstract input.  Ordinarily, the abstract
472
| value is simply rounded and packed into the double-precision format, with
473
| the inexact exception raised if the abstract input cannot be represented
474
| exactly.  However, if the abstract value is too large, the overflow and
475
| inexact exceptions are raised and an infinity or maximal finite value is
476
| returned.  If the abstract value is too small, the input value is rounded
477
| to a subnormal number, and the underflow and inexact exceptions are raised
478
| if the abstract input cannot be represented exactly as a subnormal double-
479
| precision floating-point number.
480
|     The input significand `zSig' has its binary point between bits 62
481
| and 61, which is 10 bits to the left of the usual location.  This shifted
482
| significand must be normalized or smaller.  If `zSig' is not normalized,
483
| `zExp' must be 0; in that case, the result returned is a subnormal number,
484
| and it must not require rounding.  In the usual case that `zSig' is
485
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
486
| The handling of underflow and overflow follows the IEC/IEEE Standard for
487
| Binary Floating-Point Arithmetic.
488
*----------------------------------------------------------------------------*/
489

    
490
static float64 roundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
491
{
492
    int8 roundingMode;
493
    flag roundNearestEven;
494
    int16 roundIncrement, roundBits;
495
    flag isTiny;
496

    
497
    roundingMode = STATUS(float_rounding_mode);
498
    roundNearestEven = ( roundingMode == float_round_nearest_even );
499
    roundIncrement = 0x200;
500
    if ( ! roundNearestEven ) {
501
        if ( roundingMode == float_round_to_zero ) {
502
            roundIncrement = 0;
503
        }
504
        else {
505
            roundIncrement = 0x3FF;
506
            if ( zSign ) {
507
                if ( roundingMode == float_round_up ) roundIncrement = 0;
508
            }
509
            else {
510
                if ( roundingMode == float_round_down ) roundIncrement = 0;
511
            }
512
        }
513
    }
514
    roundBits = zSig & 0x3FF;
515
    if ( 0x7FD <= (uint16_t) zExp ) {
516
        if (    ( 0x7FD < zExp )
517
             || (    ( zExp == 0x7FD )
518
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
519
           ) {
520
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
521
            return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
522
        }
523
        if ( zExp < 0 ) {
524
            if (STATUS(flush_to_zero)) {
525
                float_raise(float_flag_output_denormal STATUS_VAR);
526
                return packFloat64(zSign, 0, 0);
527
            }
528
            isTiny =
529
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
530
                || ( zExp < -1 )
531
                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
532
            shift64RightJamming( zSig, - zExp, &zSig );
533
            zExp = 0;
534
            roundBits = zSig & 0x3FF;
535
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
536
        }
537
    }
538
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
539
    zSig = ( zSig + roundIncrement )>>10;
540
    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
541
    if ( zSig == 0 ) zExp = 0;
542
    return packFloat64( zSign, zExp, zSig );
543

    
544
}
545

    
546
/*----------------------------------------------------------------------------
547
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
548
| and significand `zSig', and returns the proper double-precision floating-
549
| point value corresponding to the abstract input.  This routine is just like
550
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
551
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
552
| floating-point exponent.
553
*----------------------------------------------------------------------------*/
554

    
555
static float64
556
 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
557
{
558
    int8 shiftCount;
559

    
560
    shiftCount = countLeadingZeros64( zSig ) - 1;
561
    return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
562

    
563
}
564

    
565
/*----------------------------------------------------------------------------
566
| Returns the fraction bits of the extended double-precision floating-point
567
| value `a'.
568
*----------------------------------------------------------------------------*/
569

    
570
INLINE uint64_t extractFloatx80Frac( floatx80 a )
571
{
572

    
573
    return a.low;
574

    
575
}
576

    
577
/*----------------------------------------------------------------------------
578
| Returns the exponent bits of the extended double-precision floating-point
579
| value `a'.
580
*----------------------------------------------------------------------------*/
581

    
582
INLINE int32 extractFloatx80Exp( floatx80 a )
583
{
584

    
585
    return a.high & 0x7FFF;
586

    
587
}
588

    
589
/*----------------------------------------------------------------------------
590
| Returns the sign bit of the extended double-precision floating-point value
591
| `a'.
592
*----------------------------------------------------------------------------*/
593

    
594
INLINE flag extractFloatx80Sign( floatx80 a )
595
{
596

    
597
    return a.high>>15;
598

    
599
}
600

    
601
/*----------------------------------------------------------------------------
602
| Normalizes the subnormal extended double-precision floating-point value
603
| represented by the denormalized significand `aSig'.  The normalized exponent
604
| and significand are stored at the locations pointed to by `zExpPtr' and
605
| `zSigPtr', respectively.
606
*----------------------------------------------------------------------------*/
607

    
608
static void
609
 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
610
{
611
    int8 shiftCount;
612

    
613
    shiftCount = countLeadingZeros64( aSig );
614
    *zSigPtr = aSig<<shiftCount;
615
    *zExpPtr = 1 - shiftCount;
616

    
617
}
618

    
619
/*----------------------------------------------------------------------------
620
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
621
| extended double-precision floating-point value, returning the result.
622
*----------------------------------------------------------------------------*/
623

    
624
INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
625
{
626
    floatx80 z;
627

    
628
    z.low = zSig;
629
    z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
630
    return z;
631

    
632
}
633

    
634
/*----------------------------------------------------------------------------
635
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
636
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
637
| and returns the proper extended double-precision floating-point value
638
| corresponding to the abstract input.  Ordinarily, the abstract value is
639
| rounded and packed into the extended double-precision format, with the
640
| inexact exception raised if the abstract input cannot be represented
641
| exactly.  However, if the abstract value is too large, the overflow and
642
| inexact exceptions are raised and an infinity or maximal finite value is
643
| returned.  If the abstract value is too small, the input value is rounded to
644
| a subnormal number, and the underflow and inexact exceptions are raised if
645
| the abstract input cannot be represented exactly as a subnormal extended
646
| double-precision floating-point number.
647
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
648
| number of bits as single or double precision, respectively.  Otherwise, the
649
| result is rounded to the full precision of the extended double-precision
650
| format.
651
|     The input significand must be normalized or smaller.  If the input
652
| significand is not normalized, `zExp' must be 0; in that case, the result
653
| returned is a subnormal number, and it must not require rounding.  The
654
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
655
| Floating-Point Arithmetic.
656
*----------------------------------------------------------------------------*/
657

    
658
static floatx80
659
 roundAndPackFloatx80(
660
     int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
661
 STATUS_PARAM)
662
{
663
    int8 roundingMode;
664
    flag roundNearestEven, increment, isTiny;
665
    int64 roundIncrement, roundMask, roundBits;
666

    
667
    roundingMode = STATUS(float_rounding_mode);
668
    roundNearestEven = ( roundingMode == float_round_nearest_even );
669
    if ( roundingPrecision == 80 ) goto precision80;
670
    if ( roundingPrecision == 64 ) {
671
        roundIncrement = LIT64( 0x0000000000000400 );
672
        roundMask = LIT64( 0x00000000000007FF );
673
    }
674
    else if ( roundingPrecision == 32 ) {
675
        roundIncrement = LIT64( 0x0000008000000000 );
676
        roundMask = LIT64( 0x000000FFFFFFFFFF );
677
    }
678
    else {
679
        goto precision80;
680
    }
681
    zSig0 |= ( zSig1 != 0 );
682
    if ( ! roundNearestEven ) {
683
        if ( roundingMode == float_round_to_zero ) {
684
            roundIncrement = 0;
685
        }
686
        else {
687
            roundIncrement = roundMask;
688
            if ( zSign ) {
689
                if ( roundingMode == float_round_up ) roundIncrement = 0;
690
            }
691
            else {
692
                if ( roundingMode == float_round_down ) roundIncrement = 0;
693
            }
694
        }
695
    }
696
    roundBits = zSig0 & roundMask;
697
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
698
        if (    ( 0x7FFE < zExp )
699
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
700
           ) {
701
            goto overflow;
702
        }
703
        if ( zExp <= 0 ) {
704
            if (STATUS(flush_to_zero)) {
705
                float_raise(float_flag_output_denormal STATUS_VAR);
706
                return packFloatx80(zSign, 0, 0);
707
            }
708
            isTiny =
709
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
710
                || ( zExp < 0 )
711
                || ( zSig0 <= zSig0 + roundIncrement );
712
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
713
            zExp = 0;
714
            roundBits = zSig0 & roundMask;
715
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
716
            if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
717
            zSig0 += roundIncrement;
718
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
719
            roundIncrement = roundMask + 1;
720
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
721
                roundMask |= roundIncrement;
722
            }
723
            zSig0 &= ~ roundMask;
724
            return packFloatx80( zSign, zExp, zSig0 );
725
        }
726
    }
727
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
728
    zSig0 += roundIncrement;
729
    if ( zSig0 < roundIncrement ) {
730
        ++zExp;
731
        zSig0 = LIT64( 0x8000000000000000 );
732
    }
733
    roundIncrement = roundMask + 1;
734
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
735
        roundMask |= roundIncrement;
736
    }
737
    zSig0 &= ~ roundMask;
738
    if ( zSig0 == 0 ) zExp = 0;
739
    return packFloatx80( zSign, zExp, zSig0 );
740
 precision80:
741
    increment = ( (int64_t) zSig1 < 0 );
742
    if ( ! roundNearestEven ) {
743
        if ( roundingMode == float_round_to_zero ) {
744
            increment = 0;
745
        }
746
        else {
747
            if ( zSign ) {
748
                increment = ( roundingMode == float_round_down ) && zSig1;
749
            }
750
            else {
751
                increment = ( roundingMode == float_round_up ) && zSig1;
752
            }
753
        }
754
    }
755
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
756
        if (    ( 0x7FFE < zExp )
757
             || (    ( zExp == 0x7FFE )
758
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
759
                  && increment
760
                )
761
           ) {
762
            roundMask = 0;
763
 overflow:
764
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
765
            if (    ( roundingMode == float_round_to_zero )
766
                 || ( zSign && ( roundingMode == float_round_up ) )
767
                 || ( ! zSign && ( roundingMode == float_round_down ) )
768
               ) {
769
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
770
            }
771
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
772
        }
773
        if ( zExp <= 0 ) {
774
            isTiny =
775
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
776
                || ( zExp < 0 )
777
                || ! increment
778
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
779
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
780
            zExp = 0;
781
            if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
782
            if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
783
            if ( roundNearestEven ) {
784
                increment = ( (int64_t) zSig1 < 0 );
785
            }
786
            else {
787
                if ( zSign ) {
788
                    increment = ( roundingMode == float_round_down ) && zSig1;
789
                }
790
                else {
791
                    increment = ( roundingMode == float_round_up ) && zSig1;
792
                }
793
            }
794
            if ( increment ) {
795
                ++zSig0;
796
                zSig0 &=
797
                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
798
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
799
            }
800
            return packFloatx80( zSign, zExp, zSig0 );
801
        }
802
    }
803
    if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
804
    if ( increment ) {
805
        ++zSig0;
806
        if ( zSig0 == 0 ) {
807
            ++zExp;
808
            zSig0 = LIT64( 0x8000000000000000 );
809
        }
810
        else {
811
            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
812
        }
813
    }
814
    else {
815
        if ( zSig0 == 0 ) zExp = 0;
816
    }
817
    return packFloatx80( zSign, zExp, zSig0 );
818

    
819
}
820

    
821
/*----------------------------------------------------------------------------
822
| Takes an abstract floating-point value having sign `zSign', exponent
823
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
824
| and returns the proper extended double-precision floating-point value
825
| corresponding to the abstract input.  This routine is just like
826
| `roundAndPackFloatx80' except that the input significand does not have to be
827
| normalized.
828
*----------------------------------------------------------------------------*/
829

    
830
static floatx80
831
 normalizeRoundAndPackFloatx80(
832
     int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
833
 STATUS_PARAM)
834
{
835
    int8 shiftCount;
836

    
837
    if ( zSig0 == 0 ) {
838
        zSig0 = zSig1;
839
        zSig1 = 0;
840
        zExp -= 64;
841
    }
842
    shiftCount = countLeadingZeros64( zSig0 );
843
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
844
    zExp -= shiftCount;
845
    return
846
        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
847

    
848
}
849

    
850
/*----------------------------------------------------------------------------
851
| Returns the least-significant 64 fraction bits of the quadruple-precision
852
| floating-point value `a'.
853
*----------------------------------------------------------------------------*/
854

    
855
INLINE uint64_t extractFloat128Frac1( float128 a )
856
{
857

    
858
    return a.low;
859

    
860
}
861

    
862
/*----------------------------------------------------------------------------
863
| Returns the most-significant 48 fraction bits of the quadruple-precision
864
| floating-point value `a'.
865
*----------------------------------------------------------------------------*/
866

    
867
INLINE uint64_t extractFloat128Frac0( float128 a )
868
{
869

    
870
    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
871

    
872
}
873

    
874
/*----------------------------------------------------------------------------
875
| Returns the exponent bits of the quadruple-precision floating-point value
876
| `a'.
877
*----------------------------------------------------------------------------*/
878

    
879
INLINE int32 extractFloat128Exp( float128 a )
880
{
881

    
882
    return ( a.high>>48 ) & 0x7FFF;
883

    
884
}
885

    
886
/*----------------------------------------------------------------------------
887
| Returns the sign bit of the quadruple-precision floating-point value `a'.
888
*----------------------------------------------------------------------------*/
889

    
890
INLINE flag extractFloat128Sign( float128 a )
891
{
892

    
893
    return a.high>>63;
894

    
895
}
896

    
897
/*----------------------------------------------------------------------------
898
| Normalizes the subnormal quadruple-precision floating-point value
899
| represented by the denormalized significand formed by the concatenation of
900
| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
901
| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
902
| significand are stored at the location pointed to by `zSig0Ptr', and the
903
| least significant 64 bits of the normalized significand are stored at the
904
| location pointed to by `zSig1Ptr'.
905
*----------------------------------------------------------------------------*/
906

    
907
static void
908
 normalizeFloat128Subnormal(
909
     uint64_t aSig0,
910
     uint64_t aSig1,
911
     int32 *zExpPtr,
912
     uint64_t *zSig0Ptr,
913
     uint64_t *zSig1Ptr
914
 )
915
{
916
    int8 shiftCount;
917

    
918
    if ( aSig0 == 0 ) {
919
        shiftCount = countLeadingZeros64( aSig1 ) - 15;
920
        if ( shiftCount < 0 ) {
921
            *zSig0Ptr = aSig1>>( - shiftCount );
922
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
923
        }
924
        else {
925
            *zSig0Ptr = aSig1<<shiftCount;
926
            *zSig1Ptr = 0;
927
        }
928
        *zExpPtr = - shiftCount - 63;
929
    }
930
    else {
931
        shiftCount = countLeadingZeros64( aSig0 ) - 15;
932
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
933
        *zExpPtr = 1 - shiftCount;
934
    }
935

    
936
}
937

    
938
/*----------------------------------------------------------------------------
939
| Packs the sign `zSign', the exponent `zExp', and the significand formed
940
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
941
| floating-point value, returning the result.  After being shifted into the
942
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
943
| added together to form the most significant 32 bits of the result.  This
944
| means that any integer portion of `zSig0' will be added into the exponent.
945
| Since a properly normalized significand will have an integer portion equal
946
| to 1, the `zExp' input should be 1 less than the desired result exponent
947
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
948
| significand.
949
*----------------------------------------------------------------------------*/
950

    
951
INLINE float128
952
 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
953
{
954
    float128 z;
955

    
956
    z.low = zSig1;
957
    z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
958
    return z;
959

    
960
}
961

    
962
/*----------------------------------------------------------------------------
963
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
964
| and extended significand formed by the concatenation of `zSig0', `zSig1',
965
| and `zSig2', and returns the proper quadruple-precision floating-point value
966
| corresponding to the abstract input.  Ordinarily, the abstract value is
967
| simply rounded and packed into the quadruple-precision format, with the
968
| inexact exception raised if the abstract input cannot be represented
969
| exactly.  However, if the abstract value is too large, the overflow and
970
| inexact exceptions are raised and an infinity or maximal finite value is
971
| returned.  If the abstract value is too small, the input value is rounded to
972
| a subnormal number, and the underflow and inexact exceptions are raised if
973
| the abstract input cannot be represented exactly as a subnormal quadruple-
974
| precision floating-point number.
975
|     The input significand must be normalized or smaller.  If the input
976
| significand is not normalized, `zExp' must be 0; in that case, the result
977
| returned is a subnormal number, and it must not require rounding.  In the
978
| usual case that the input significand is normalized, `zExp' must be 1 less
979
| than the ``true'' floating-point exponent.  The handling of underflow and
980
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
981
*----------------------------------------------------------------------------*/
982

    
983
static float128
984
 roundAndPackFloat128(
985
     flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
986
{
987
    int8 roundingMode;
988
    flag roundNearestEven, increment, isTiny;
989

    
990
    roundingMode = STATUS(float_rounding_mode);
991
    roundNearestEven = ( roundingMode == float_round_nearest_even );
992
    increment = ( (int64_t) zSig2 < 0 );
993
    if ( ! roundNearestEven ) {
994
        if ( roundingMode == float_round_to_zero ) {
995
            increment = 0;
996
        }
997
        else {
998
            if ( zSign ) {
999
                increment = ( roundingMode == float_round_down ) && zSig2;
1000
            }
1001
            else {
1002
                increment = ( roundingMode == float_round_up ) && zSig2;
1003
            }
1004
        }
1005
    }
1006
    if ( 0x7FFD <= (uint32_t) zExp ) {
1007
        if (    ( 0x7FFD < zExp )
1008
             || (    ( zExp == 0x7FFD )
1009
                  && eq128(
1010
                         LIT64( 0x0001FFFFFFFFFFFF ),
1011
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
1012
                         zSig0,
1013
                         zSig1
1014
                     )
1015
                  && increment
1016
                )
1017
           ) {
1018
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1019
            if (    ( roundingMode == float_round_to_zero )
1020
                 || ( zSign && ( roundingMode == float_round_up ) )
1021
                 || ( ! zSign && ( roundingMode == float_round_down ) )
1022
               ) {
1023
                return
1024
                    packFloat128(
1025
                        zSign,
1026
                        0x7FFE,
1027
                        LIT64( 0x0000FFFFFFFFFFFF ),
1028
                        LIT64( 0xFFFFFFFFFFFFFFFF )
1029
                    );
1030
            }
1031
            return packFloat128( zSign, 0x7FFF, 0, 0 );
1032
        }
1033
        if ( zExp < 0 ) {
1034
            if (STATUS(flush_to_zero)) {
1035
                float_raise(float_flag_output_denormal STATUS_VAR);
1036
                return packFloat128(zSign, 0, 0, 0);
1037
            }
1038
            isTiny =
1039
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1040
                || ( zExp < -1 )
1041
                || ! increment
1042
                || lt128(
1043
                       zSig0,
1044
                       zSig1,
1045
                       LIT64( 0x0001FFFFFFFFFFFF ),
1046
                       LIT64( 0xFFFFFFFFFFFFFFFF )
1047
                   );
1048
            shift128ExtraRightJamming(
1049
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1050
            zExp = 0;
1051
            if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1052
            if ( roundNearestEven ) {
1053
                increment = ( (int64_t) zSig2 < 0 );
1054
            }
1055
            else {
1056
                if ( zSign ) {
1057
                    increment = ( roundingMode == float_round_down ) && zSig2;
1058
                }
1059
                else {
1060
                    increment = ( roundingMode == float_round_up ) && zSig2;
1061
                }
1062
            }
1063
        }
1064
    }
1065
    if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1066
    if ( increment ) {
1067
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1068
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1069
    }
1070
    else {
1071
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1072
    }
1073
    return packFloat128( zSign, zExp, zSig0, zSig1 );
1074

    
1075
}
1076

    
1077
/*----------------------------------------------------------------------------
1078
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1079
| and significand formed by the concatenation of `zSig0' and `zSig1', and
1080
| returns the proper quadruple-precision floating-point value corresponding
1081
| to the abstract input.  This routine is just like `roundAndPackFloat128'
1082
| except that the input significand has fewer bits and does not have to be
1083
| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1084
| point exponent.
1085
*----------------------------------------------------------------------------*/
1086

    
1087
static float128
1088
 normalizeRoundAndPackFloat128(
1089
     flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1090
{
1091
    int8 shiftCount;
1092
    uint64_t zSig2;
1093

    
1094
    if ( zSig0 == 0 ) {
1095
        zSig0 = zSig1;
1096
        zSig1 = 0;
1097
        zExp -= 64;
1098
    }
1099
    shiftCount = countLeadingZeros64( zSig0 ) - 15;
1100
    if ( 0 <= shiftCount ) {
1101
        zSig2 = 0;
1102
        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1103
    }
1104
    else {
1105
        shift128ExtraRightJamming(
1106
            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1107
    }
1108
    zExp -= shiftCount;
1109
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1110

    
1111
}
1112

    
1113
/*----------------------------------------------------------------------------
1114
| Returns the result of converting the 32-bit two's complement integer `a'
1115
| to the single-precision floating-point format.  The conversion is performed
1116
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1117
*----------------------------------------------------------------------------*/
1118

    
1119
float32 int32_to_float32( int32 a STATUS_PARAM )
1120
{
1121
    flag zSign;
1122

    
1123
    if ( a == 0 ) return float32_zero;
1124
    if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1125
    zSign = ( a < 0 );
1126
    return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1127

    
1128
}
1129

    
1130
/*----------------------------------------------------------------------------
1131
| Returns the result of converting the 32-bit two's complement integer `a'
1132
| to the double-precision floating-point format.  The conversion is performed
1133
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1134
*----------------------------------------------------------------------------*/
1135

    
1136
float64 int32_to_float64( int32 a STATUS_PARAM )
1137
{
1138
    flag zSign;
1139
    uint32 absA;
1140
    int8 shiftCount;
1141
    uint64_t zSig;
1142

    
1143
    if ( a == 0 ) return float64_zero;
1144
    zSign = ( a < 0 );
1145
    absA = zSign ? - a : a;
1146
    shiftCount = countLeadingZeros32( absA ) + 21;
1147
    zSig = absA;
1148
    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1149

    
1150
}
1151

    
1152
/*----------------------------------------------------------------------------
1153
| Returns the result of converting the 32-bit two's complement integer `a'
1154
| to the extended double-precision floating-point format.  The conversion
1155
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1156
| Arithmetic.
1157
*----------------------------------------------------------------------------*/
1158

    
1159
floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1160
{
1161
    flag zSign;
1162
    uint32 absA;
1163
    int8 shiftCount;
1164
    uint64_t zSig;
1165

    
1166
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1167
    zSign = ( a < 0 );
1168
    absA = zSign ? - a : a;
1169
    shiftCount = countLeadingZeros32( absA ) + 32;
1170
    zSig = absA;
1171
    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1172

    
1173
}
1174

    
1175
/*----------------------------------------------------------------------------
1176
| Returns the result of converting the 32-bit two's complement integer `a' to
1177
| the quadruple-precision floating-point format.  The conversion is performed
1178
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1179
*----------------------------------------------------------------------------*/
1180

    
1181
float128 int32_to_float128( int32 a STATUS_PARAM )
1182
{
1183
    flag zSign;
1184
    uint32 absA;
1185
    int8 shiftCount;
1186
    uint64_t zSig0;
1187

    
1188
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1189
    zSign = ( a < 0 );
1190
    absA = zSign ? - a : a;
1191
    shiftCount = countLeadingZeros32( absA ) + 17;
1192
    zSig0 = absA;
1193
    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1194

    
1195
}
1196

    
1197
/*----------------------------------------------------------------------------
1198
| Returns the result of converting the 64-bit two's complement integer `a'
1199
| to the single-precision floating-point format.  The conversion is performed
1200
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1201
*----------------------------------------------------------------------------*/
1202

    
1203
float32 int64_to_float32( int64 a STATUS_PARAM )
1204
{
1205
    flag zSign;
1206
    uint64 absA;
1207
    int8 shiftCount;
1208

    
1209
    if ( a == 0 ) return float32_zero;
1210
    zSign = ( a < 0 );
1211
    absA = zSign ? - a : a;
1212
    shiftCount = countLeadingZeros64( absA ) - 40;
1213
    if ( 0 <= shiftCount ) {
1214
        return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1215
    }
1216
    else {
1217
        shiftCount += 7;
1218
        if ( shiftCount < 0 ) {
1219
            shift64RightJamming( absA, - shiftCount, &absA );
1220
        }
1221
        else {
1222
            absA <<= shiftCount;
1223
        }
1224
        return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1225
    }
1226

    
1227
}
1228

    
1229
float32 uint64_to_float32( uint64 a STATUS_PARAM )
1230
{
1231
    int8 shiftCount;
1232

    
1233
    if ( a == 0 ) return float32_zero;
1234
    shiftCount = countLeadingZeros64( a ) - 40;
1235
    if ( 0 <= shiftCount ) {
1236
        return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1237
    }
1238
    else {
1239
        shiftCount += 7;
1240
        if ( shiftCount < 0 ) {
1241
            shift64RightJamming( a, - shiftCount, &a );
1242
        }
1243
        else {
1244
            a <<= shiftCount;
1245
        }
1246
        return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1247
    }
1248
}
1249

    
1250
/*----------------------------------------------------------------------------
1251
| Returns the result of converting the 64-bit two's complement integer `a'
1252
| to the double-precision floating-point format.  The conversion is performed
1253
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1254
*----------------------------------------------------------------------------*/
1255

    
1256
float64 int64_to_float64( int64 a STATUS_PARAM )
1257
{
1258
    flag zSign;
1259

    
1260
    if ( a == 0 ) return float64_zero;
1261
    if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1262
        return packFloat64( 1, 0x43E, 0 );
1263
    }
1264
    zSign = ( a < 0 );
1265
    return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1266

    
1267
}
1268

    
1269
float64 uint64_to_float64( uint64 a STATUS_PARAM )
1270
{
1271
    if ( a == 0 ) return float64_zero;
1272
    return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1273

    
1274
}
1275

    
1276
/*----------------------------------------------------------------------------
1277
| Returns the result of converting the 64-bit two's complement integer `a'
1278
| to the extended double-precision floating-point format.  The conversion
1279
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1280
| Arithmetic.
1281
*----------------------------------------------------------------------------*/
1282

    
1283
floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1284
{
1285
    flag zSign;
1286
    uint64 absA;
1287
    int8 shiftCount;
1288

    
1289
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1290
    zSign = ( a < 0 );
1291
    absA = zSign ? - a : a;
1292
    shiftCount = countLeadingZeros64( absA );
1293
    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1294

    
1295
}
1296

    
1297
/*----------------------------------------------------------------------------
1298
| Returns the result of converting the 64-bit two's complement integer `a' to
1299
| the quadruple-precision floating-point format.  The conversion is performed
1300
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1301
*----------------------------------------------------------------------------*/
1302

    
1303
float128 int64_to_float128( int64 a STATUS_PARAM )
1304
{
1305
    flag zSign;
1306
    uint64 absA;
1307
    int8 shiftCount;
1308
    int32 zExp;
1309
    uint64_t zSig0, zSig1;
1310

    
1311
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1312
    zSign = ( a < 0 );
1313
    absA = zSign ? - a : a;
1314
    shiftCount = countLeadingZeros64( absA ) + 49;
1315
    zExp = 0x406E - shiftCount;
1316
    if ( 64 <= shiftCount ) {
1317
        zSig1 = 0;
1318
        zSig0 = absA;
1319
        shiftCount -= 64;
1320
    }
1321
    else {
1322
        zSig1 = absA;
1323
        zSig0 = 0;
1324
    }
1325
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1326
    return packFloat128( zSign, zExp, zSig0, zSig1 );
1327

    
1328
}
1329

    
1330
/*----------------------------------------------------------------------------
1331
| Returns the result of converting the single-precision floating-point value
1332
| `a' to the 32-bit two's complement integer format.  The conversion is
1333
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1334
| Arithmetic---which means in particular that the conversion is rounded
1335
| according to the current rounding mode.  If `a' is a NaN, the largest
1336
| positive integer is returned.  Otherwise, if the conversion overflows, the
1337
| largest integer with the same sign as `a' is returned.
1338
*----------------------------------------------------------------------------*/
1339

    
1340
int32 float32_to_int32( float32 a STATUS_PARAM )
1341
{
1342
    flag aSign;
1343
    int16 aExp, shiftCount;
1344
    uint32_t aSig;
1345
    uint64_t aSig64;
1346

    
1347
    a = float32_squash_input_denormal(a STATUS_VAR);
1348
    aSig = extractFloat32Frac( a );
1349
    aExp = extractFloat32Exp( a );
1350
    aSign = extractFloat32Sign( a );
1351
    if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1352
    if ( aExp ) aSig |= 0x00800000;
1353
    shiftCount = 0xAF - aExp;
1354
    aSig64 = aSig;
1355
    aSig64 <<= 32;
1356
    if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1357
    return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1358

    
1359
}
1360

    
1361
/*----------------------------------------------------------------------------
1362
| Returns the result of converting the single-precision floating-point value
1363
| `a' to the 32-bit two's complement integer format.  The conversion is
1364
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1365
| Arithmetic, except that the conversion is always rounded toward zero.
1366
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1367
| the conversion overflows, the largest integer with the same sign as `a' is
1368
| returned.
1369
*----------------------------------------------------------------------------*/
1370

    
1371
int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1372
{
1373
    flag aSign;
1374
    int16 aExp, shiftCount;
1375
    uint32_t aSig;
1376
    int32 z;
1377
    a = float32_squash_input_denormal(a STATUS_VAR);
1378

    
1379
    aSig = extractFloat32Frac( a );
1380
    aExp = extractFloat32Exp( a );
1381
    aSign = extractFloat32Sign( a );
1382
    shiftCount = aExp - 0x9E;
1383
    if ( 0 <= shiftCount ) {
1384
        if ( float32_val(a) != 0xCF000000 ) {
1385
            float_raise( float_flag_invalid STATUS_VAR);
1386
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1387
        }
1388
        return (int32_t) 0x80000000;
1389
    }
1390
    else if ( aExp <= 0x7E ) {
1391
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1392
        return 0;
1393
    }
1394
    aSig = ( aSig | 0x00800000 )<<8;
1395
    z = aSig>>( - shiftCount );
1396
    if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1397
        STATUS(float_exception_flags) |= float_flag_inexact;
1398
    }
1399
    if ( aSign ) z = - z;
1400
    return z;
1401

    
1402
}
1403

    
1404
/*----------------------------------------------------------------------------
1405
| Returns the result of converting the single-precision floating-point value
1406
| `a' to the 16-bit two's complement integer format.  The conversion is
1407
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1408
| Arithmetic, except that the conversion is always rounded toward zero.
1409
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1410
| the conversion overflows, the largest integer with the same sign as `a' is
1411
| returned.
1412
*----------------------------------------------------------------------------*/
1413

    
1414
int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1415
{
1416
    flag aSign;
1417
    int16 aExp, shiftCount;
1418
    uint32_t aSig;
1419
    int32 z;
1420

    
1421
    aSig = extractFloat32Frac( a );
1422
    aExp = extractFloat32Exp( a );
1423
    aSign = extractFloat32Sign( a );
1424
    shiftCount = aExp - 0x8E;
1425
    if ( 0 <= shiftCount ) {
1426
        if ( float32_val(a) != 0xC7000000 ) {
1427
            float_raise( float_flag_invalid STATUS_VAR);
1428
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1429
                return 0x7FFF;
1430
            }
1431
        }
1432
        return (int32_t) 0xffff8000;
1433
    }
1434
    else if ( aExp <= 0x7E ) {
1435
        if ( aExp | aSig ) {
1436
            STATUS(float_exception_flags) |= float_flag_inexact;
1437
        }
1438
        return 0;
1439
    }
1440
    shiftCount -= 0x10;
1441
    aSig = ( aSig | 0x00800000 )<<8;
1442
    z = aSig>>( - shiftCount );
1443
    if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1444
        STATUS(float_exception_flags) |= float_flag_inexact;
1445
    }
1446
    if ( aSign ) {
1447
        z = - z;
1448
    }
1449
    return z;
1450

    
1451
}
1452

    
1453
/*----------------------------------------------------------------------------
1454
| Returns the result of converting the single-precision floating-point value
1455
| `a' to the 64-bit two's complement integer format.  The conversion is
1456
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1457
| Arithmetic---which means in particular that the conversion is rounded
1458
| according to the current rounding mode.  If `a' is a NaN, the largest
1459
| positive integer is returned.  Otherwise, if the conversion overflows, the
1460
| largest integer with the same sign as `a' is returned.
1461
*----------------------------------------------------------------------------*/
1462

    
1463
int64 float32_to_int64( float32 a STATUS_PARAM )
1464
{
1465
    flag aSign;
1466
    int16 aExp, shiftCount;
1467
    uint32_t aSig;
1468
    uint64_t aSig64, aSigExtra;
1469
    a = float32_squash_input_denormal(a STATUS_VAR);
1470

    
1471
    aSig = extractFloat32Frac( a );
1472
    aExp = extractFloat32Exp( a );
1473
    aSign = extractFloat32Sign( a );
1474
    shiftCount = 0xBE - aExp;
1475
    if ( shiftCount < 0 ) {
1476
        float_raise( float_flag_invalid STATUS_VAR);
1477
        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1478
            return LIT64( 0x7FFFFFFFFFFFFFFF );
1479
        }
1480
        return (int64_t) LIT64( 0x8000000000000000 );
1481
    }
1482
    if ( aExp ) aSig |= 0x00800000;
1483
    aSig64 = aSig;
1484
    aSig64 <<= 40;
1485
    shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1486
    return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1487

    
1488
}
1489

    
1490
/*----------------------------------------------------------------------------
1491
| Returns the result of converting the single-precision floating-point value
1492
| `a' to the 64-bit two's complement integer format.  The conversion is
1493
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1494
| Arithmetic, except that the conversion is always rounded toward zero.  If
1495
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1496
| conversion overflows, the largest integer with the same sign as `a' is
1497
| returned.
1498
*----------------------------------------------------------------------------*/
1499

    
1500
int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1501
{
1502
    flag aSign;
1503
    int16 aExp, shiftCount;
1504
    uint32_t aSig;
1505
    uint64_t aSig64;
1506
    int64 z;
1507
    a = float32_squash_input_denormal(a STATUS_VAR);
1508

    
1509
    aSig = extractFloat32Frac( a );
1510
    aExp = extractFloat32Exp( a );
1511
    aSign = extractFloat32Sign( a );
1512
    shiftCount = aExp - 0xBE;
1513
    if ( 0 <= shiftCount ) {
1514
        if ( float32_val(a) != 0xDF000000 ) {
1515
            float_raise( float_flag_invalid STATUS_VAR);
1516
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1517
                return LIT64( 0x7FFFFFFFFFFFFFFF );
1518
            }
1519
        }
1520
        return (int64_t) LIT64( 0x8000000000000000 );
1521
    }
1522
    else if ( aExp <= 0x7E ) {
1523
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1524
        return 0;
1525
    }
1526
    aSig64 = aSig | 0x00800000;
1527
    aSig64 <<= 40;
1528
    z = aSig64>>( - shiftCount );
1529
    if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1530
        STATUS(float_exception_flags) |= float_flag_inexact;
1531
    }
1532
    if ( aSign ) z = - z;
1533
    return z;
1534

    
1535
}
1536

    
1537
/*----------------------------------------------------------------------------
1538
| Returns the result of converting the single-precision floating-point value
1539
| `a' to the double-precision floating-point format.  The conversion is
1540
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1541
| Arithmetic.
1542
*----------------------------------------------------------------------------*/
1543

    
1544
float64 float32_to_float64( float32 a STATUS_PARAM )
1545
{
1546
    flag aSign;
1547
    int16 aExp;
1548
    uint32_t aSig;
1549
    a = float32_squash_input_denormal(a STATUS_VAR);
1550

    
1551
    aSig = extractFloat32Frac( a );
1552
    aExp = extractFloat32Exp( a );
1553
    aSign = extractFloat32Sign( a );
1554
    if ( aExp == 0xFF ) {
1555
        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1556
        return packFloat64( aSign, 0x7FF, 0 );
1557
    }
1558
    if ( aExp == 0 ) {
1559
        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1560
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1561
        --aExp;
1562
    }
1563
    return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1564

    
1565
}
1566

    
1567
/*----------------------------------------------------------------------------
1568
| Returns the result of converting the single-precision floating-point value
1569
| `a' to the extended double-precision floating-point format.  The conversion
1570
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1571
| Arithmetic.
1572
*----------------------------------------------------------------------------*/
1573

    
1574
floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1575
{
1576
    flag aSign;
1577
    int16 aExp;
1578
    uint32_t aSig;
1579

    
1580
    a = float32_squash_input_denormal(a STATUS_VAR);
1581
    aSig = extractFloat32Frac( a );
1582
    aExp = extractFloat32Exp( a );
1583
    aSign = extractFloat32Sign( a );
1584
    if ( aExp == 0xFF ) {
1585
        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1586
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1587
    }
1588
    if ( aExp == 0 ) {
1589
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1590
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1591
    }
1592
    aSig |= 0x00800000;
1593
    return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1594

    
1595
}
1596

    
1597
/*----------------------------------------------------------------------------
1598
| Returns the result of converting the single-precision floating-point value
1599
| `a' to the double-precision floating-point format.  The conversion is
1600
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1601
| Arithmetic.
1602
*----------------------------------------------------------------------------*/
1603

    
1604
float128 float32_to_float128( float32 a STATUS_PARAM )
1605
{
1606
    flag aSign;
1607
    int16 aExp;
1608
    uint32_t aSig;
1609

    
1610
    a = float32_squash_input_denormal(a STATUS_VAR);
1611
    aSig = extractFloat32Frac( a );
1612
    aExp = extractFloat32Exp( a );
1613
    aSign = extractFloat32Sign( a );
1614
    if ( aExp == 0xFF ) {
1615
        if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1616
        return packFloat128( aSign, 0x7FFF, 0, 0 );
1617
    }
1618
    if ( aExp == 0 ) {
1619
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1620
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1621
        --aExp;
1622
    }
1623
    return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1624

    
1625
}
1626

    
1627
/*----------------------------------------------------------------------------
1628
| Rounds the single-precision floating-point value `a' to an integer, and
1629
| returns the result as a single-precision floating-point value.  The
1630
| operation is performed according to the IEC/IEEE Standard for Binary
1631
| Floating-Point Arithmetic.
1632
*----------------------------------------------------------------------------*/
1633

    
1634
float32 float32_round_to_int( float32 a STATUS_PARAM)
1635
{
1636
    flag aSign;
1637
    int16 aExp;
1638
    uint32_t lastBitMask, roundBitsMask;
1639
    int8 roundingMode;
1640
    uint32_t z;
1641
    a = float32_squash_input_denormal(a STATUS_VAR);
1642

    
1643
    aExp = extractFloat32Exp( a );
1644
    if ( 0x96 <= aExp ) {
1645
        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1646
            return propagateFloat32NaN( a, a STATUS_VAR );
1647
        }
1648
        return a;
1649
    }
1650
    if ( aExp <= 0x7E ) {
1651
        if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1652
        STATUS(float_exception_flags) |= float_flag_inexact;
1653
        aSign = extractFloat32Sign( a );
1654
        switch ( STATUS(float_rounding_mode) ) {
1655
         case float_round_nearest_even:
1656
            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1657
                return packFloat32( aSign, 0x7F, 0 );
1658
            }
1659
            break;
1660
         case float_round_down:
1661
            return make_float32(aSign ? 0xBF800000 : 0);
1662
         case float_round_up:
1663
            return make_float32(aSign ? 0x80000000 : 0x3F800000);
1664
        }
1665
        return packFloat32( aSign, 0, 0 );
1666
    }
1667
    lastBitMask = 1;
1668
    lastBitMask <<= 0x96 - aExp;
1669
    roundBitsMask = lastBitMask - 1;
1670
    z = float32_val(a);
1671
    roundingMode = STATUS(float_rounding_mode);
1672
    if ( roundingMode == float_round_nearest_even ) {
1673
        z += lastBitMask>>1;
1674
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1675
    }
1676
    else if ( roundingMode != float_round_to_zero ) {
1677
        if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1678
            z += roundBitsMask;
1679
        }
1680
    }
1681
    z &= ~ roundBitsMask;
1682
    if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1683
    return make_float32(z);
1684

    
1685
}
1686

    
1687
/*----------------------------------------------------------------------------
1688
| Returns the result of adding the absolute values of the single-precision
1689
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1690
| before being returned.  `zSign' is ignored if the result is a NaN.
1691
| The addition is performed according to the IEC/IEEE Standard for Binary
1692
| Floating-Point Arithmetic.
1693
*----------------------------------------------------------------------------*/
1694

    
1695
static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1696
{
1697
    int16 aExp, bExp, zExp;
1698
    uint32_t aSig, bSig, zSig;
1699
    int16 expDiff;
1700

    
1701
    aSig = extractFloat32Frac( a );
1702
    aExp = extractFloat32Exp( a );
1703
    bSig = extractFloat32Frac( b );
1704
    bExp = extractFloat32Exp( b );
1705
    expDiff = aExp - bExp;
1706
    aSig <<= 6;
1707
    bSig <<= 6;
1708
    if ( 0 < expDiff ) {
1709
        if ( aExp == 0xFF ) {
1710
            if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1711
            return a;
1712
        }
1713
        if ( bExp == 0 ) {
1714
            --expDiff;
1715
        }
1716
        else {
1717
            bSig |= 0x20000000;
1718
        }
1719
        shift32RightJamming( bSig, expDiff, &bSig );
1720
        zExp = aExp;
1721
    }
1722
    else if ( expDiff < 0 ) {
1723
        if ( bExp == 0xFF ) {
1724
            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1725
            return packFloat32( zSign, 0xFF, 0 );
1726
        }
1727
        if ( aExp == 0 ) {
1728
            ++expDiff;
1729
        }
1730
        else {
1731
            aSig |= 0x20000000;
1732
        }
1733
        shift32RightJamming( aSig, - expDiff, &aSig );
1734
        zExp = bExp;
1735
    }
1736
    else {
1737
        if ( aExp == 0xFF ) {
1738
            if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1739
            return a;
1740
        }
1741
        if ( aExp == 0 ) {
1742
            if (STATUS(flush_to_zero)) {
1743
                if (aSig | bSig) {
1744
                    float_raise(float_flag_output_denormal STATUS_VAR);
1745
                }
1746
                return packFloat32(zSign, 0, 0);
1747
            }
1748
            return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1749
        }
1750
        zSig = 0x40000000 + aSig + bSig;
1751
        zExp = aExp;
1752
        goto roundAndPack;
1753
    }
1754
    aSig |= 0x20000000;
1755
    zSig = ( aSig + bSig )<<1;
1756
    --zExp;
1757
    if ( (int32_t) zSig < 0 ) {
1758
        zSig = aSig + bSig;
1759
        ++zExp;
1760
    }
1761
 roundAndPack:
1762
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1763

    
1764
}
1765

    
1766
/*----------------------------------------------------------------------------
1767
| Returns the result of subtracting the absolute values of the single-
1768
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
1769
| difference is negated before being returned.  `zSign' is ignored if the
1770
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
1771
| Standard for Binary Floating-Point Arithmetic.
1772
*----------------------------------------------------------------------------*/
1773

    
1774
static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1775
{
1776
    int16 aExp, bExp, zExp;
1777
    uint32_t aSig, bSig, zSig;
1778
    int16 expDiff;
1779

    
1780
    aSig = extractFloat32Frac( a );
1781
    aExp = extractFloat32Exp( a );
1782
    bSig = extractFloat32Frac( b );
1783
    bExp = extractFloat32Exp( b );
1784
    expDiff = aExp - bExp;
1785
    aSig <<= 7;
1786
    bSig <<= 7;
1787
    if ( 0 < expDiff ) goto aExpBigger;
1788
    if ( expDiff < 0 ) goto bExpBigger;
1789
    if ( aExp == 0xFF ) {
1790
        if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1791
        float_raise( float_flag_invalid STATUS_VAR);
1792
        return float32_default_nan;
1793
    }
1794
    if ( aExp == 0 ) {
1795
        aExp = 1;
1796
        bExp = 1;
1797
    }
1798
    if ( bSig < aSig ) goto aBigger;
1799
    if ( aSig < bSig ) goto bBigger;
1800
    return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1801
 bExpBigger:
1802
    if ( bExp == 0xFF ) {
1803
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1804
        return packFloat32( zSign ^ 1, 0xFF, 0 );
1805
    }
1806
    if ( aExp == 0 ) {
1807
        ++expDiff;
1808
    }
1809
    else {
1810
        aSig |= 0x40000000;
1811
    }
1812
    shift32RightJamming( aSig, - expDiff, &aSig );
1813
    bSig |= 0x40000000;
1814
 bBigger:
1815
    zSig = bSig - aSig;
1816
    zExp = bExp;
1817
    zSign ^= 1;
1818
    goto normalizeRoundAndPack;
1819
 aExpBigger:
1820
    if ( aExp == 0xFF ) {
1821
        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1822
        return a;
1823
    }
1824
    if ( bExp == 0 ) {
1825
        --expDiff;
1826
    }
1827
    else {
1828
        bSig |= 0x40000000;
1829
    }
1830
    shift32RightJamming( bSig, expDiff, &bSig );
1831
    aSig |= 0x40000000;
1832
 aBigger:
1833
    zSig = aSig - bSig;
1834
    zExp = aExp;
1835
 normalizeRoundAndPack:
1836
    --zExp;
1837
    return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1838

    
1839
}
1840

    
1841
/*----------------------------------------------------------------------------
1842
| Returns the result of adding the single-precision floating-point values `a'
1843
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
1844
| Binary Floating-Point Arithmetic.
1845
*----------------------------------------------------------------------------*/
1846

    
1847
float32 float32_add( float32 a, float32 b STATUS_PARAM )
1848
{
1849
    flag aSign, bSign;
1850
    a = float32_squash_input_denormal(a STATUS_VAR);
1851
    b = float32_squash_input_denormal(b STATUS_VAR);
1852

    
1853
    aSign = extractFloat32Sign( a );
1854
    bSign = extractFloat32Sign( b );
1855
    if ( aSign == bSign ) {
1856
        return addFloat32Sigs( a, b, aSign STATUS_VAR);
1857
    }
1858
    else {
1859
        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1860
    }
1861

    
1862
}
1863

    
1864
/*----------------------------------------------------------------------------
1865
| Returns the result of subtracting the single-precision floating-point values
1866
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1867
| for Binary Floating-Point Arithmetic.
1868
*----------------------------------------------------------------------------*/
1869

    
1870
float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1871
{
1872
    flag aSign, bSign;
1873
    a = float32_squash_input_denormal(a STATUS_VAR);
1874
    b = float32_squash_input_denormal(b STATUS_VAR);
1875

    
1876
    aSign = extractFloat32Sign( a );
1877
    bSign = extractFloat32Sign( b );
1878
    if ( aSign == bSign ) {
1879
        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1880
    }
1881
    else {
1882
        return addFloat32Sigs( a, b, aSign STATUS_VAR );
1883
    }
1884

    
1885
}
1886

    
1887
/*----------------------------------------------------------------------------
1888
| Returns the result of multiplying the single-precision floating-point values
1889
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1890
| for Binary Floating-Point Arithmetic.
1891
*----------------------------------------------------------------------------*/
1892

    
1893
float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1894
{
1895
    flag aSign, bSign, zSign;
1896
    int16 aExp, bExp, zExp;
1897
    uint32_t aSig, bSig;
1898
    uint64_t zSig64;
1899
    uint32_t zSig;
1900

    
1901
    a = float32_squash_input_denormal(a STATUS_VAR);
1902
    b = float32_squash_input_denormal(b STATUS_VAR);
1903

    
1904
    aSig = extractFloat32Frac( a );
1905
    aExp = extractFloat32Exp( a );
1906
    aSign = extractFloat32Sign( a );
1907
    bSig = extractFloat32Frac( b );
1908
    bExp = extractFloat32Exp( b );
1909
    bSign = extractFloat32Sign( b );
1910
    zSign = aSign ^ bSign;
1911
    if ( aExp == 0xFF ) {
1912
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1913
            return propagateFloat32NaN( a, b STATUS_VAR );
1914
        }
1915
        if ( ( bExp | bSig ) == 0 ) {
1916
            float_raise( float_flag_invalid STATUS_VAR);
1917
            return float32_default_nan;
1918
        }
1919
        return packFloat32( zSign, 0xFF, 0 );
1920
    }
1921
    if ( bExp == 0xFF ) {
1922
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1923
        if ( ( aExp | aSig ) == 0 ) {
1924
            float_raise( float_flag_invalid STATUS_VAR);
1925
            return float32_default_nan;
1926
        }
1927
        return packFloat32( zSign, 0xFF, 0 );
1928
    }
1929
    if ( aExp == 0 ) {
1930
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1931
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1932
    }
1933
    if ( bExp == 0 ) {
1934
        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1935
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1936
    }
1937
    zExp = aExp + bExp - 0x7F;
1938
    aSig = ( aSig | 0x00800000 )<<7;
1939
    bSig = ( bSig | 0x00800000 )<<8;
1940
    shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
1941
    zSig = zSig64;
1942
    if ( 0 <= (int32_t) ( zSig<<1 ) ) {
1943
        zSig <<= 1;
1944
        --zExp;
1945
    }
1946
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1947

    
1948
}
1949

    
1950
/*----------------------------------------------------------------------------
1951
| Returns the result of dividing the single-precision floating-point value `a'
1952
| by the corresponding value `b'.  The operation is performed according to the
1953
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1954
*----------------------------------------------------------------------------*/
1955

    
1956
float32 float32_div( float32 a, float32 b STATUS_PARAM )
1957
{
1958
    flag aSign, bSign, zSign;
1959
    int16 aExp, bExp, zExp;
1960
    uint32_t aSig, bSig, zSig;
1961
    a = float32_squash_input_denormal(a STATUS_VAR);
1962
    b = float32_squash_input_denormal(b STATUS_VAR);
1963

    
1964
    aSig = extractFloat32Frac( a );
1965
    aExp = extractFloat32Exp( a );
1966
    aSign = extractFloat32Sign( a );
1967
    bSig = extractFloat32Frac( b );
1968
    bExp = extractFloat32Exp( b );
1969
    bSign = extractFloat32Sign( b );
1970
    zSign = aSign ^ bSign;
1971
    if ( aExp == 0xFF ) {
1972
        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1973
        if ( bExp == 0xFF ) {
1974
            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1975
            float_raise( float_flag_invalid STATUS_VAR);
1976
            return float32_default_nan;
1977
        }
1978
        return packFloat32( zSign, 0xFF, 0 );
1979
    }
1980
    if ( bExp == 0xFF ) {
1981
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1982
        return packFloat32( zSign, 0, 0 );
1983
    }
1984
    if ( bExp == 0 ) {
1985
        if ( bSig == 0 ) {
1986
            if ( ( aExp | aSig ) == 0 ) {
1987
                float_raise( float_flag_invalid STATUS_VAR);
1988
                return float32_default_nan;
1989
            }
1990
            float_raise( float_flag_divbyzero STATUS_VAR);
1991
            return packFloat32( zSign, 0xFF, 0 );
1992
        }
1993
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1994
    }
1995
    if ( aExp == 0 ) {
1996
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1997
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1998
    }
1999
    zExp = aExp - bExp + 0x7D;
2000
    aSig = ( aSig | 0x00800000 )<<7;
2001
    bSig = ( bSig | 0x00800000 )<<8;
2002
    if ( bSig <= ( aSig + aSig ) ) {
2003
        aSig >>= 1;
2004
        ++zExp;
2005
    }
2006
    zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2007
    if ( ( zSig & 0x3F ) == 0 ) {
2008
        zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2009
    }
2010
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2011

    
2012
}
2013

    
2014
/*----------------------------------------------------------------------------
2015
| Returns the remainder of the single-precision floating-point value `a'
2016
| with respect to the corresponding value `b'.  The operation is performed
2017
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2018
*----------------------------------------------------------------------------*/
2019

    
2020
float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2021
{
2022
    flag aSign, zSign;
2023
    int16 aExp, bExp, expDiff;
2024
    uint32_t aSig, bSig;
2025
    uint32_t q;
2026
    uint64_t aSig64, bSig64, q64;
2027
    uint32_t alternateASig;
2028
    int32_t sigMean;
2029
    a = float32_squash_input_denormal(a STATUS_VAR);
2030
    b = float32_squash_input_denormal(b STATUS_VAR);
2031

    
2032
    aSig = extractFloat32Frac( a );
2033
    aExp = extractFloat32Exp( a );
2034
    aSign = extractFloat32Sign( a );
2035
    bSig = extractFloat32Frac( b );
2036
    bExp = extractFloat32Exp( b );
2037
    if ( aExp == 0xFF ) {
2038
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2039
            return propagateFloat32NaN( a, b STATUS_VAR );
2040
        }
2041
        float_raise( float_flag_invalid STATUS_VAR);
2042
        return float32_default_nan;
2043
    }
2044
    if ( bExp == 0xFF ) {
2045
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2046
        return a;
2047
    }
2048
    if ( bExp == 0 ) {
2049
        if ( bSig == 0 ) {
2050
            float_raise( float_flag_invalid STATUS_VAR);
2051
            return float32_default_nan;
2052
        }
2053
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2054
    }
2055
    if ( aExp == 0 ) {
2056
        if ( aSig == 0 ) return a;
2057
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2058
    }
2059
    expDiff = aExp - bExp;
2060
    aSig |= 0x00800000;
2061
    bSig |= 0x00800000;
2062
    if ( expDiff < 32 ) {
2063
        aSig <<= 8;
2064
        bSig <<= 8;
2065
        if ( expDiff < 0 ) {
2066
            if ( expDiff < -1 ) return a;
2067
            aSig >>= 1;
2068
        }
2069
        q = ( bSig <= aSig );
2070
        if ( q ) aSig -= bSig;
2071
        if ( 0 < expDiff ) {
2072
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2073
            q >>= 32 - expDiff;
2074
            bSig >>= 2;
2075
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2076
        }
2077
        else {
2078
            aSig >>= 2;
2079
            bSig >>= 2;
2080
        }
2081
    }
2082
    else {
2083
        if ( bSig <= aSig ) aSig -= bSig;
2084
        aSig64 = ( (uint64_t) aSig )<<40;
2085
        bSig64 = ( (uint64_t) bSig )<<40;
2086
        expDiff -= 64;
2087
        while ( 0 < expDiff ) {
2088
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2089
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2090
            aSig64 = - ( ( bSig * q64 )<<38 );
2091
            expDiff -= 62;
2092
        }
2093
        expDiff += 64;
2094
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2095
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2096
        q = q64>>( 64 - expDiff );
2097
        bSig <<= 6;
2098
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2099
    }
2100
    do {
2101
        alternateASig = aSig;
2102
        ++q;
2103
        aSig -= bSig;
2104
    } while ( 0 <= (int32_t) aSig );
2105
    sigMean = aSig + alternateASig;
2106
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2107
        aSig = alternateASig;
2108
    }
2109
    zSign = ( (int32_t) aSig < 0 );
2110
    if ( zSign ) aSig = - aSig;
2111
    return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2112

    
2113
}
2114

    
2115
/*----------------------------------------------------------------------------
2116
| Returns the square root of the single-precision floating-point value `a'.
2117
| The operation is performed according to the IEC/IEEE Standard for Binary
2118
| Floating-Point Arithmetic.
2119
*----------------------------------------------------------------------------*/
2120

    
2121
float32 float32_sqrt( float32 a STATUS_PARAM )
2122
{
2123
    flag aSign;
2124
    int16 aExp, zExp;
2125
    uint32_t aSig, zSig;
2126
    uint64_t rem, term;
2127
    a = float32_squash_input_denormal(a STATUS_VAR);
2128

    
2129
    aSig = extractFloat32Frac( a );
2130
    aExp = extractFloat32Exp( a );
2131
    aSign = extractFloat32Sign( a );
2132
    if ( aExp == 0xFF ) {
2133
        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2134
        if ( ! aSign ) return a;
2135
        float_raise( float_flag_invalid STATUS_VAR);
2136
        return float32_default_nan;
2137
    }
2138
    if ( aSign ) {
2139
        if ( ( aExp | aSig ) == 0 ) return a;
2140
        float_raise( float_flag_invalid STATUS_VAR);
2141
        return float32_default_nan;
2142
    }
2143
    if ( aExp == 0 ) {
2144
        if ( aSig == 0 ) return float32_zero;
2145
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2146
    }
2147
    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2148
    aSig = ( aSig | 0x00800000 )<<8;
2149
    zSig = estimateSqrt32( aExp, aSig ) + 2;
2150
    if ( ( zSig & 0x7F ) <= 5 ) {
2151
        if ( zSig < 2 ) {
2152
            zSig = 0x7FFFFFFF;
2153
            goto roundAndPack;
2154
        }
2155
        aSig >>= aExp & 1;
2156
        term = ( (uint64_t) zSig ) * zSig;
2157
        rem = ( ( (uint64_t) aSig )<<32 ) - term;
2158
        while ( (int64_t) rem < 0 ) {
2159
            --zSig;
2160
            rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2161
        }
2162
        zSig |= ( rem != 0 );
2163
    }
2164
    shift32RightJamming( zSig, 1, &zSig );
2165
 roundAndPack:
2166
    return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2167

    
2168
}
2169

    
2170
/*----------------------------------------------------------------------------
2171
| Returns the binary exponential of the single-precision floating-point value
2172
| `a'. The operation is performed according to the IEC/IEEE Standard for
2173
| Binary Floating-Point Arithmetic.
2174
|
2175
| Uses the following identities:
2176
|
2177
| 1. -------------------------------------------------------------------------
2178
|      x    x*ln(2)
2179
|     2  = e
2180
|
2181
| 2. -------------------------------------------------------------------------
2182
|                      2     3     4     5           n
2183
|      x        x     x     x     x     x           x
2184
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2185
|               1!    2!    3!    4!    5!          n!
2186
*----------------------------------------------------------------------------*/
2187

    
2188
static const float64 float32_exp2_coefficients[15] =
2189
{
2190
    const_float64( 0x3ff0000000000000ll ), /*  1 */
2191
    const_float64( 0x3fe0000000000000ll ), /*  2 */
2192
    const_float64( 0x3fc5555555555555ll ), /*  3 */
2193
    const_float64( 0x3fa5555555555555ll ), /*  4 */
2194
    const_float64( 0x3f81111111111111ll ), /*  5 */
2195
    const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2196
    const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2197
    const_float64( 0x3efa01a01a01a01all ), /*  8 */
2198
    const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2199
    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2200
    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2201
    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2202
    const_float64( 0x3de6124613a86d09ll ), /* 13 */
2203
    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2204
    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2205
};
2206

    
2207
float32 float32_exp2( float32 a STATUS_PARAM )
2208
{
2209
    flag aSign;
2210
    int16 aExp;
2211
    uint32_t aSig;
2212
    float64 r, x, xn;
2213
    int i;
2214
    a = float32_squash_input_denormal(a STATUS_VAR);
2215

    
2216
    aSig = extractFloat32Frac( a );
2217
    aExp = extractFloat32Exp( a );
2218
    aSign = extractFloat32Sign( a );
2219

    
2220
    if ( aExp == 0xFF) {
2221
        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2222
        return (aSign) ? float32_zero : a;
2223
    }
2224
    if (aExp == 0) {
2225
        if (aSig == 0) return float32_one;
2226
    }
2227

    
2228
    float_raise( float_flag_inexact STATUS_VAR);
2229

    
2230
    /* ******************************* */
2231
    /* using float64 for approximation */
2232
    /* ******************************* */
2233
    x = float32_to_float64(a STATUS_VAR);
2234
    x = float64_mul(x, float64_ln2 STATUS_VAR);
2235

    
2236
    xn = x;
2237
    r = float64_one;
2238
    for (i = 0 ; i < 15 ; i++) {
2239
        float64 f;
2240

    
2241
        f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2242
        r = float64_add(r, f STATUS_VAR);
2243

    
2244
        xn = float64_mul(xn, x STATUS_VAR);
2245
    }
2246

    
2247
    return float64_to_float32(r, status);
2248
}
2249

    
2250
/*----------------------------------------------------------------------------
2251
| Returns the binary log of the single-precision floating-point value `a'.
2252
| The operation is performed according to the IEC/IEEE Standard for Binary
2253
| Floating-Point Arithmetic.
2254
*----------------------------------------------------------------------------*/
2255
float32 float32_log2( float32 a STATUS_PARAM )
2256
{
2257
    flag aSign, zSign;
2258
    int16 aExp;
2259
    uint32_t aSig, zSig, i;
2260

    
2261
    a = float32_squash_input_denormal(a STATUS_VAR);
2262
    aSig = extractFloat32Frac( a );
2263
    aExp = extractFloat32Exp( a );
2264
    aSign = extractFloat32Sign( a );
2265

    
2266
    if ( aExp == 0 ) {
2267
        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2268
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2269
    }
2270
    if ( aSign ) {
2271
        float_raise( float_flag_invalid STATUS_VAR);
2272
        return float32_default_nan;
2273
    }
2274
    if ( aExp == 0xFF ) {
2275
        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2276
        return a;
2277
    }
2278

    
2279
    aExp -= 0x7F;
2280
    aSig |= 0x00800000;
2281
    zSign = aExp < 0;
2282
    zSig = aExp << 23;
2283

    
2284
    for (i = 1 << 22; i > 0; i >>= 1) {
2285
        aSig = ( (uint64_t)aSig * aSig ) >> 23;
2286
        if ( aSig & 0x01000000 ) {
2287
            aSig >>= 1;
2288
            zSig |= i;
2289
        }
2290
    }
2291

    
2292
    if ( zSign )
2293
        zSig = -zSig;
2294

    
2295
    return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2296
}
2297

    
2298
/*----------------------------------------------------------------------------
2299
| Returns 1 if the single-precision floating-point value `a' is equal to
2300
| the corresponding value `b', and 0 otherwise.  The invalid exception is
2301
| raised if either operand is a NaN.  Otherwise, the comparison is performed
2302
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2303
*----------------------------------------------------------------------------*/
2304

    
2305
int float32_eq( float32 a, float32 b STATUS_PARAM )
2306
{
2307
    uint32_t av, bv;
2308
    a = float32_squash_input_denormal(a STATUS_VAR);
2309
    b = float32_squash_input_denormal(b STATUS_VAR);
2310

    
2311
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2312
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2313
       ) {
2314
        float_raise( float_flag_invalid STATUS_VAR);
2315
        return 0;
2316
    }
2317
    av = float32_val(a);
2318
    bv = float32_val(b);
2319
    return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2320
}
2321

    
2322
/*----------------------------------------------------------------------------
2323
| Returns 1 if the single-precision floating-point value `a' is less than
2324
| or equal to the corresponding value `b', and 0 otherwise.  The invalid
2325
| exception is raised if either operand is a NaN.  The comparison is performed
2326
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2327
*----------------------------------------------------------------------------*/
2328

    
2329
int float32_le( float32 a, float32 b STATUS_PARAM )
2330
{
2331
    flag aSign, bSign;
2332
    uint32_t av, bv;
2333
    a = float32_squash_input_denormal(a STATUS_VAR);
2334
    b = float32_squash_input_denormal(b STATUS_VAR);
2335

    
2336
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2337
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2338
       ) {
2339
        float_raise( float_flag_invalid STATUS_VAR);
2340
        return 0;
2341
    }
2342
    aSign = extractFloat32Sign( a );
2343
    bSign = extractFloat32Sign( b );
2344
    av = float32_val(a);
2345
    bv = float32_val(b);
2346
    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2347
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2348

    
2349
}
2350

    
2351
/*----------------------------------------------------------------------------
2352
| Returns 1 if the single-precision floating-point value `a' is less than
2353
| the corresponding value `b', and 0 otherwise.  The invalid exception is
2354
| raised if either operand is a NaN.  The comparison is performed according
2355
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2356
*----------------------------------------------------------------------------*/
2357

    
2358
int float32_lt( float32 a, float32 b STATUS_PARAM )
2359
{
2360
    flag aSign, bSign;
2361
    uint32_t av, bv;
2362
    a = float32_squash_input_denormal(a STATUS_VAR);
2363
    b = float32_squash_input_denormal(b STATUS_VAR);
2364

    
2365
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2366
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2367
       ) {
2368
        float_raise( float_flag_invalid STATUS_VAR);
2369
        return 0;
2370
    }
2371
    aSign = extractFloat32Sign( a );
2372
    bSign = extractFloat32Sign( b );
2373
    av = float32_val(a);
2374
    bv = float32_val(b);
2375
    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2376
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2377

    
2378
}
2379

    
2380
/*----------------------------------------------------------------------------
2381
| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2382
| be compared, and 0 otherwise.  The invalid exception is raised if either
2383
| operand is a NaN.  The comparison is performed according to the IEC/IEEE
2384
| Standard for Binary Floating-Point Arithmetic.
2385
*----------------------------------------------------------------------------*/
2386

    
2387
int float32_unordered( float32 a, float32 b STATUS_PARAM )
2388
{
2389
    a = float32_squash_input_denormal(a STATUS_VAR);
2390
    b = float32_squash_input_denormal(b STATUS_VAR);
2391

    
2392
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2393
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2394
       ) {
2395
        float_raise( float_flag_invalid STATUS_VAR);
2396
        return 1;
2397
    }
2398
    return 0;
2399
}
2400

    
2401
/*----------------------------------------------------------------------------
2402
| Returns 1 if the single-precision floating-point value `a' is equal to
2403
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2404
| exception.  The comparison is performed according to the IEC/IEEE Standard
2405
| for Binary Floating-Point Arithmetic.
2406
*----------------------------------------------------------------------------*/
2407

    
2408
int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2409
{
2410
    a = float32_squash_input_denormal(a STATUS_VAR);
2411
    b = float32_squash_input_denormal(b STATUS_VAR);
2412

    
2413
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2414
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2415
       ) {
2416
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2417
            float_raise( float_flag_invalid STATUS_VAR);
2418
        }
2419
        return 0;
2420
    }
2421
    return ( float32_val(a) == float32_val(b) ) ||
2422
            ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2423
}
2424

    
2425
/*----------------------------------------------------------------------------
2426
| Returns 1 if the single-precision floating-point value `a' is less than or
2427
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2428
| cause an exception.  Otherwise, the comparison is performed according to the
2429
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2430
*----------------------------------------------------------------------------*/
2431

    
2432
int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2433
{
2434
    flag aSign, bSign;
2435
    uint32_t av, bv;
2436
    a = float32_squash_input_denormal(a STATUS_VAR);
2437
    b = float32_squash_input_denormal(b STATUS_VAR);
2438

    
2439
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2440
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2441
       ) {
2442
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2443
            float_raise( float_flag_invalid STATUS_VAR);
2444
        }
2445
        return 0;
2446
    }
2447
    aSign = extractFloat32Sign( a );
2448
    bSign = extractFloat32Sign( b );
2449
    av = float32_val(a);
2450
    bv = float32_val(b);
2451
    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2452
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2453

    
2454
}
2455

    
2456
/*----------------------------------------------------------------------------
2457
| Returns 1 if the single-precision floating-point value `a' is less than
2458
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2459
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2460
| Standard for Binary Floating-Point Arithmetic.
2461
*----------------------------------------------------------------------------*/
2462

    
2463
int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2464
{
2465
    flag aSign, bSign;
2466
    uint32_t av, bv;
2467
    a = float32_squash_input_denormal(a STATUS_VAR);
2468
    b = float32_squash_input_denormal(b STATUS_VAR);
2469

    
2470
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2471
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2472
       ) {
2473
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2474
            float_raise( float_flag_invalid STATUS_VAR);
2475
        }
2476
        return 0;
2477
    }
2478
    aSign = extractFloat32Sign( a );
2479
    bSign = extractFloat32Sign( b );
2480
    av = float32_val(a);
2481
    bv = float32_val(b);
2482
    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2483
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2484

    
2485
}
2486

    
2487
/*----------------------------------------------------------------------------
2488
| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2489
| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
2490
| comparison is performed according to the IEC/IEEE Standard for Binary
2491
| Floating-Point Arithmetic.
2492
*----------------------------------------------------------------------------*/
2493

    
2494
int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2495
{
2496
    a = float32_squash_input_denormal(a STATUS_VAR);
2497
    b = float32_squash_input_denormal(b STATUS_VAR);
2498

    
2499
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2500
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2501
       ) {
2502
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2503
            float_raise( float_flag_invalid STATUS_VAR);
2504
        }
2505
        return 1;
2506
    }
2507
    return 0;
2508
}
2509

    
2510
/*----------------------------------------------------------------------------
2511
| Returns the result of converting the double-precision floating-point value
2512
| `a' to the 32-bit two's complement integer format.  The conversion is
2513
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2514
| Arithmetic---which means in particular that the conversion is rounded
2515
| according to the current rounding mode.  If `a' is a NaN, the largest
2516
| positive integer is returned.  Otherwise, if the conversion overflows, the
2517
| largest integer with the same sign as `a' is returned.
2518
*----------------------------------------------------------------------------*/
2519

    
2520
int32 float64_to_int32( float64 a STATUS_PARAM )
2521
{
2522
    flag aSign;
2523
    int16 aExp, shiftCount;
2524
    uint64_t aSig;
2525
    a = float64_squash_input_denormal(a STATUS_VAR);
2526

    
2527
    aSig = extractFloat64Frac( a );
2528
    aExp = extractFloat64Exp( a );
2529
    aSign = extractFloat64Sign( a );
2530
    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2531
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2532
    shiftCount = 0x42C - aExp;
2533
    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2534
    return roundAndPackInt32( aSign, aSig STATUS_VAR );
2535

    
2536
}
2537

    
2538
/*----------------------------------------------------------------------------
2539
| Returns the result of converting the double-precision floating-point value
2540
| `a' to the 32-bit two's complement integer format.  The conversion is
2541
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2542
| Arithmetic, except that the conversion is always rounded toward zero.
2543
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2544
| the conversion overflows, the largest integer with the same sign as `a' is
2545
| returned.
2546
*----------------------------------------------------------------------------*/
2547

    
2548
int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2549
{
2550
    flag aSign;
2551
    int16 aExp, shiftCount;
2552
    uint64_t aSig, savedASig;
2553
    int32 z;
2554
    a = float64_squash_input_denormal(a STATUS_VAR);
2555

    
2556
    aSig = extractFloat64Frac( a );
2557
    aExp = extractFloat64Exp( a );
2558
    aSign = extractFloat64Sign( a );
2559
    if ( 0x41E < aExp ) {
2560
        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2561
        goto invalid;
2562
    }
2563
    else if ( aExp < 0x3FF ) {
2564
        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2565
        return 0;
2566
    }
2567
    aSig |= LIT64( 0x0010000000000000 );
2568
    shiftCount = 0x433 - aExp;
2569
    savedASig = aSig;
2570
    aSig >>= shiftCount;
2571
    z = aSig;
2572
    if ( aSign ) z = - z;
2573
    if ( ( z < 0 ) ^ aSign ) {
2574
 invalid:
2575
        float_raise( float_flag_invalid STATUS_VAR);
2576
        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2577
    }
2578
    if ( ( aSig<<shiftCount ) != savedASig ) {
2579
        STATUS(float_exception_flags) |= float_flag_inexact;
2580
    }
2581
    return z;
2582

    
2583
}
2584

    
2585
/*----------------------------------------------------------------------------
2586
| Returns the result of converting the double-precision floating-point value
2587
| `a' to the 16-bit two's complement integer format.  The conversion is
2588
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2589
| Arithmetic, except that the conversion is always rounded toward zero.
2590
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2591
| the conversion overflows, the largest integer with the same sign as `a' is
2592
| returned.
2593
*----------------------------------------------------------------------------*/
2594

    
2595
int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2596
{
2597
    flag aSign;
2598
    int16 aExp, shiftCount;
2599
    uint64_t aSig, savedASig;
2600
    int32 z;
2601

    
2602
    aSig = extractFloat64Frac( a );
2603
    aExp = extractFloat64Exp( a );
2604
    aSign = extractFloat64Sign( a );
2605
    if ( 0x40E < aExp ) {
2606
        if ( ( aExp == 0x7FF ) && aSig ) {
2607
            aSign = 0;
2608
        }
2609
        goto invalid;
2610
    }
2611
    else if ( aExp < 0x3FF ) {
2612
        if ( aExp || aSig ) {
2613
            STATUS(float_exception_flags) |= float_flag_inexact;
2614
        }
2615
        return 0;
2616
    }
2617
    aSig |= LIT64( 0x0010000000000000 );
2618
    shiftCount = 0x433 - aExp;
2619
    savedASig = aSig;
2620
    aSig >>= shiftCount;
2621
    z = aSig;
2622
    if ( aSign ) {
2623
        z = - z;
2624
    }
2625
    if ( ( (int16_t)z < 0 ) ^ aSign ) {
2626
 invalid:
2627
        float_raise( float_flag_invalid STATUS_VAR);
2628
        return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2629
    }
2630
    if ( ( aSig<<shiftCount ) != savedASig ) {
2631
        STATUS(float_exception_flags) |= float_flag_inexact;
2632
    }
2633
    return z;
2634
}
2635

    
2636
/*----------------------------------------------------------------------------
2637
| Returns the result of converting the double-precision floating-point value
2638
| `a' to the 64-bit two's complement integer format.  The conversion is
2639
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2640
| Arithmetic---which means in particular that the conversion is rounded
2641
| according to the current rounding mode.  If `a' is a NaN, the largest
2642
| positive integer is returned.  Otherwise, if the conversion overflows, the
2643
| largest integer with the same sign as `a' is returned.
2644
*----------------------------------------------------------------------------*/
2645

    
2646
int64 float64_to_int64( float64 a STATUS_PARAM )
2647
{
2648
    flag aSign;
2649
    int16 aExp, shiftCount;
2650
    uint64_t aSig, aSigExtra;
2651
    a = float64_squash_input_denormal(a STATUS_VAR);
2652

    
2653
    aSig = extractFloat64Frac( a );
2654
    aExp = extractFloat64Exp( a );
2655
    aSign = extractFloat64Sign( a );
2656
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2657
    shiftCount = 0x433 - aExp;
2658
    if ( shiftCount <= 0 ) {
2659
        if ( 0x43E < aExp ) {
2660
            float_raise( float_flag_invalid STATUS_VAR);
2661
            if (    ! aSign
2662
                 || (    ( aExp == 0x7FF )
2663
                      && ( aSig != LIT64( 0x0010000000000000 ) ) )
2664
               ) {
2665
                return LIT64( 0x7FFFFFFFFFFFFFFF );
2666
            }
2667
            return (int64_t) LIT64( 0x8000000000000000 );
2668
        }
2669
        aSigExtra = 0;
2670
        aSig <<= - shiftCount;
2671
    }
2672
    else {
2673
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2674
    }
2675
    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2676

    
2677
}
2678

    
2679
/*----------------------------------------------------------------------------
2680
| Returns the result of converting the double-precision floating-point value
2681
| `a' to the 64-bit two's complement integer format.  The conversion is
2682
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2683
| Arithmetic, except that the conversion is always rounded toward zero.
2684
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2685
| the conversion overflows, the largest integer with the same sign as `a' is
2686
| returned.
2687
*----------------------------------------------------------------------------*/
2688

    
2689
int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2690
{
2691
    flag aSign;
2692
    int16 aExp, shiftCount;
2693
    uint64_t aSig;
2694
    int64 z;
2695
    a = float64_squash_input_denormal(a STATUS_VAR);
2696

    
2697
    aSig = extractFloat64Frac( a );
2698
    aExp = extractFloat64Exp( a );
2699
    aSign = extractFloat64Sign( a );
2700
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2701
    shiftCount = aExp - 0x433;
2702
    if ( 0 <= shiftCount ) {
2703
        if ( 0x43E <= aExp ) {
2704
            if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2705
                float_raise( float_flag_invalid STATUS_VAR);
2706
                if (    ! aSign
2707
                     || (    ( aExp == 0x7FF )
2708
                          && ( aSig != LIT64( 0x0010000000000000 ) ) )
2709
                   ) {
2710
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
2711
                }
2712
            }
2713
            return (int64_t) LIT64( 0x8000000000000000 );
2714
        }
2715
        z = aSig<<shiftCount;
2716
    }
2717
    else {
2718
        if ( aExp < 0x3FE ) {
2719
            if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2720
            return 0;
2721
        }
2722
        z = aSig>>( - shiftCount );
2723
        if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
2724
            STATUS(float_exception_flags) |= float_flag_inexact;
2725
        }
2726
    }
2727
    if ( aSign ) z = - z;
2728
    return z;
2729

    
2730
}
2731

    
2732
/*----------------------------------------------------------------------------
2733
| Returns the result of converting the double-precision floating-point value
2734
| `a' to the single-precision floating-point format.  The conversion is
2735
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2736
| Arithmetic.
2737
*----------------------------------------------------------------------------*/
2738

    
2739
float32 float64_to_float32( float64 a STATUS_PARAM )
2740
{
2741
    flag aSign;
2742
    int16 aExp;
2743
    uint64_t aSig;
2744
    uint32_t zSig;
2745
    a = float64_squash_input_denormal(a STATUS_VAR);
2746

    
2747
    aSig = extractFloat64Frac( a );
2748
    aExp = extractFloat64Exp( a );
2749
    aSign = extractFloat64Sign( a );
2750
    if ( aExp == 0x7FF ) {
2751
        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2752
        return packFloat32( aSign, 0xFF, 0 );
2753
    }
2754
    shift64RightJamming( aSig, 22, &aSig );
2755
    zSig = aSig;
2756
    if ( aExp || zSig ) {
2757
        zSig |= 0x40000000;
2758
        aExp -= 0x381;
2759
    }
2760
    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2761

    
2762
}
2763

    
2764

    
2765
/*----------------------------------------------------------------------------
2766
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2767
| half-precision floating-point value, returning the result.  After being
2768
| shifted into the proper positions, the three fields are simply added
2769
| together to form the result.  This means that any integer portion of `zSig'
2770
| will be added into the exponent.  Since a properly normalized significand
2771
| will have an integer portion equal to 1, the `zExp' input should be 1 less
2772
| than the desired result exponent whenever `zSig' is a complete, normalized
2773
| significand.
2774
*----------------------------------------------------------------------------*/
2775
static float16 packFloat16(flag zSign, int16 zExp, uint16_t zSig)
2776
{
2777
    return make_float16(
2778
        (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
2779
}
2780

    
2781
/* Half precision floats come in two formats: standard IEEE and "ARM" format.
2782
   The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
2783

    
2784
float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
2785
{
2786
    flag aSign;
2787
    int16 aExp;
2788
    uint32_t aSig;
2789

    
2790
    aSign = extractFloat16Sign(a);
2791
    aExp = extractFloat16Exp(a);
2792
    aSig = extractFloat16Frac(a);
2793

    
2794
    if (aExp == 0x1f && ieee) {
2795
        if (aSig) {
2796
            return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
2797
        }
2798
        return packFloat32(aSign, 0xff, aSig << 13);
2799
    }
2800
    if (aExp == 0) {
2801
        int8 shiftCount;
2802

    
2803
        if (aSig == 0) {
2804
            return packFloat32(aSign, 0, 0);
2805
        }
2806

    
2807
        shiftCount = countLeadingZeros32( aSig ) - 21;
2808
        aSig = aSig << shiftCount;
2809
        aExp = -shiftCount;
2810
    }
2811
    return packFloat32( aSign, aExp + 0x70, aSig << 13);
2812
}
2813

    
2814
float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
2815
{
2816
    flag aSign;
2817
    int16 aExp;
2818
    uint32_t aSig;
2819
    uint32_t mask;
2820
    uint32_t increment;
2821
    int8 roundingMode;
2822
    a = float32_squash_input_denormal(a STATUS_VAR);
2823

    
2824
    aSig = extractFloat32Frac( a );
2825
    aExp = extractFloat32Exp( a );
2826
    aSign = extractFloat32Sign( a );
2827
    if ( aExp == 0xFF ) {
2828
        if (aSig) {
2829
            /* Input is a NaN */
2830
            float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2831
            if (!ieee) {
2832
                return packFloat16(aSign, 0, 0);
2833
            }
2834
            return r;
2835
        }
2836
        /* Infinity */
2837
        if (!ieee) {
2838
            float_raise(float_flag_invalid STATUS_VAR);
2839
            return packFloat16(aSign, 0x1f, 0x3ff);
2840
        }
2841
        return packFloat16(aSign, 0x1f, 0);
2842
    }
2843
    if (aExp == 0 && aSig == 0) {
2844
        return packFloat16(aSign, 0, 0);
2845
    }
2846
    /* Decimal point between bits 22 and 23.  */
2847
    aSig |= 0x00800000;
2848
    aExp -= 0x7f;
2849
    if (aExp < -14) {
2850
        mask = 0x00ffffff;
2851
        if (aExp >= -24) {
2852
            mask >>= 25 + aExp;
2853
        }
2854
    } else {
2855
        mask = 0x00001fff;
2856
    }
2857
    if (aSig & mask) {
2858
        float_raise( float_flag_underflow STATUS_VAR );
2859
        roundingMode = STATUS(float_rounding_mode);
2860
        switch (roundingMode) {
2861
        case float_round_nearest_even:
2862
            increment = (mask + 1) >> 1;
2863
            if ((aSig & mask) == increment) {
2864
                increment = aSig & (increment << 1);
2865
            }
2866
            break;
2867
        case float_round_up:
2868
            increment = aSign ? 0 : mask;
2869
            break;
2870
        case float_round_down:
2871
            increment = aSign ? mask : 0;
2872
            break;
2873
        default: /* round_to_zero */
2874
            increment = 0;
2875
            break;
2876
        }
2877
        aSig += increment;
2878
        if (aSig >= 0x01000000) {
2879
            aSig >>= 1;
2880
            aExp++;
2881
        }
2882
    } else if (aExp < -14
2883
          && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2884
        float_raise( float_flag_underflow STATUS_VAR);
2885
    }
2886

    
2887
    if (ieee) {
2888
        if (aExp > 15) {
2889
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2890
            return packFloat16(aSign, 0x1f, 0);
2891
        }
2892
    } else {
2893
        if (aExp > 16) {
2894
            float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
2895
            return packFloat16(aSign, 0x1f, 0x3ff);
2896
        }
2897
    }
2898
    if (aExp < -24) {
2899
        return packFloat16(aSign, 0, 0);
2900
    }
2901
    if (aExp < -14) {
2902
        aSig >>= -14 - aExp;
2903
        aExp = -14;
2904
    }
2905
    return packFloat16(aSign, aExp + 14, aSig >> 13);
2906
}
2907

    
2908
/*----------------------------------------------------------------------------
2909
| Returns the result of converting the double-precision floating-point value
2910
| `a' to the extended double-precision floating-point format.  The conversion
2911
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2912
| Arithmetic.
2913
*----------------------------------------------------------------------------*/
2914

    
2915
floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2916
{
2917
    flag aSign;
2918
    int16 aExp;
2919
    uint64_t aSig;
2920

    
2921
    a = float64_squash_input_denormal(a STATUS_VAR);
2922
    aSig = extractFloat64Frac( a );
2923
    aExp = extractFloat64Exp( a );
2924
    aSign = extractFloat64Sign( a );
2925
    if ( aExp == 0x7FF ) {
2926
        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2927
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2928
    }
2929
    if ( aExp == 0 ) {
2930
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2931
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2932
    }
2933
    return
2934
        packFloatx80(
2935
            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2936

    
2937
}
2938

    
2939
/*----------------------------------------------------------------------------
2940
| Returns the result of converting the double-precision floating-point value
2941
| `a' to the quadruple-precision floating-point format.  The conversion is
2942
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2943
| Arithmetic.
2944
*----------------------------------------------------------------------------*/
2945

    
2946
float128 float64_to_float128( float64 a STATUS_PARAM )
2947
{
2948
    flag aSign;
2949
    int16 aExp;
2950
    uint64_t aSig, zSig0, zSig1;
2951

    
2952
    a = float64_squash_input_denormal(a STATUS_VAR);
2953
    aSig = extractFloat64Frac( a );
2954
    aExp = extractFloat64Exp( a );
2955
    aSign = extractFloat64Sign( a );
2956
    if ( aExp == 0x7FF ) {
2957
        if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2958
        return packFloat128( aSign, 0x7FFF, 0, 0 );
2959
    }
2960
    if ( aExp == 0 ) {
2961
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2962
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2963
        --aExp;
2964
    }
2965
    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2966
    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2967

    
2968
}
2969

    
2970
/*----------------------------------------------------------------------------
2971
| Rounds the double-precision floating-point value `a' to an integer, and
2972
| returns the result as a double-precision floating-point value.  The
2973
| operation is performed according to the IEC/IEEE Standard for Binary
2974
| Floating-Point Arithmetic.
2975
*----------------------------------------------------------------------------*/
2976

    
2977
float64 float64_round_to_int( float64 a STATUS_PARAM )
2978
{
2979
    flag aSign;
2980
    int16 aExp;
2981
    uint64_t lastBitMask, roundBitsMask;
2982
    int8 roundingMode;
2983
    uint64_t z;
2984
    a = float64_squash_input_denormal(a STATUS_VAR);
2985

    
2986
    aExp = extractFloat64Exp( a );
2987
    if ( 0x433 <= aExp ) {
2988
        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2989
            return propagateFloat64NaN( a, a STATUS_VAR );
2990
        }
2991
        return a;
2992
    }
2993
    if ( aExp < 0x3FF ) {
2994
        if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
2995
        STATUS(float_exception_flags) |= float_flag_inexact;
2996
        aSign = extractFloat64Sign( a );
2997
        switch ( STATUS(float_rounding_mode) ) {
2998
         case float_round_nearest_even:
2999
            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3000
                return packFloat64( aSign, 0x3FF, 0 );
3001
            }
3002
            break;
3003
         case float_round_down:
3004
            return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3005
         case float_round_up:
3006
            return make_float64(
3007
            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3008
        }
3009
        return packFloat64( aSign, 0, 0 );
3010
    }
3011
    lastBitMask = 1;
3012
    lastBitMask <<= 0x433 - aExp;
3013
    roundBitsMask = lastBitMask - 1;
3014
    z = float64_val(a);
3015
    roundingMode = STATUS(float_rounding_mode);
3016
    if ( roundingMode == float_round_nearest_even ) {
3017
        z += lastBitMask>>1;
3018
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3019
    }
3020
    else if ( roundingMode != float_round_to_zero ) {
3021
        if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3022
            z += roundBitsMask;
3023
        }
3024
    }
3025
    z &= ~ roundBitsMask;
3026
    if ( z != float64_val(a) )
3027
        STATUS(float_exception_flags) |= float_flag_inexact;
3028
    return make_float64(z);
3029

    
3030
}
3031

    
3032
float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3033
{
3034
    int oldmode;
3035
    float64 res;
3036
    oldmode = STATUS(float_rounding_mode);
3037
    STATUS(float_rounding_mode) = float_round_to_zero;
3038
    res = float64_round_to_int(a STATUS_VAR);
3039
    STATUS(float_rounding_mode) = oldmode;
3040
    return res;
3041
}
3042

    
3043
/*----------------------------------------------------------------------------
3044
| Returns the result of adding the absolute values of the double-precision
3045
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3046
| before being returned.  `zSign' is ignored if the result is a NaN.
3047
| The addition is performed according to the IEC/IEEE Standard for Binary
3048
| Floating-Point Arithmetic.
3049
*----------------------------------------------------------------------------*/
3050

    
3051
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3052
{
3053
    int16 aExp, bExp, zExp;
3054
    uint64_t aSig, bSig, zSig;
3055
    int16 expDiff;
3056

    
3057
    aSig = extractFloat64Frac( a );
3058
    aExp = extractFloat64Exp( a );
3059
    bSig = extractFloat64Frac( b );
3060
    bExp = extractFloat64Exp( b );
3061
    expDiff = aExp - bExp;
3062
    aSig <<= 9;
3063
    bSig <<= 9;
3064
    if ( 0 < expDiff ) {
3065
        if ( aExp == 0x7FF ) {
3066
            if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3067
            return a;
3068
        }
3069
        if ( bExp == 0 ) {
3070
            --expDiff;
3071
        }
3072
        else {
3073
            bSig |= LIT64( 0x2000000000000000 );
3074
        }
3075
        shift64RightJamming( bSig, expDiff, &bSig );
3076
        zExp = aExp;
3077
    }
3078
    else if ( expDiff < 0 ) {
3079
        if ( bExp == 0x7FF ) {
3080
            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3081
            return packFloat64( zSign, 0x7FF, 0 );
3082
        }
3083
        if ( aExp == 0 ) {
3084
            ++expDiff;
3085
        }
3086
        else {
3087
            aSig |= LIT64( 0x2000000000000000 );
3088
        }
3089
        shift64RightJamming( aSig, - expDiff, &aSig );
3090
        zExp = bExp;
3091
    }
3092
    else {
3093
        if ( aExp == 0x7FF ) {
3094
            if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3095
            return a;
3096
        }
3097
        if ( aExp == 0 ) {
3098
            if (STATUS(flush_to_zero)) {
3099
                if (aSig | bSig) {
3100
                    float_raise(float_flag_output_denormal STATUS_VAR);
3101
                }
3102
                return packFloat64(zSign, 0, 0);
3103
            }
3104
            return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3105
        }
3106
        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3107
        zExp = aExp;
3108
        goto roundAndPack;
3109
    }
3110
    aSig |= LIT64( 0x2000000000000000 );
3111
    zSig = ( aSig + bSig )<<1;
3112
    --zExp;
3113
    if ( (int64_t) zSig < 0 ) {
3114
        zSig = aSig + bSig;
3115
        ++zExp;
3116
    }
3117
 roundAndPack:
3118
    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3119

    
3120
}
3121

    
3122
/*----------------------------------------------------------------------------
3123
| Returns the result of subtracting the absolute values of the double-
3124
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
3125
| difference is negated before being returned.  `zSign' is ignored if the
3126
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
3127
| Standard for Binary Floating-Point Arithmetic.
3128
*----------------------------------------------------------------------------*/
3129

    
3130
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3131
{
3132
    int16 aExp, bExp, zExp;
3133
    uint64_t aSig, bSig, zSig;
3134
    int16 expDiff;
3135

    
3136
    aSig = extractFloat64Frac( a );
3137
    aExp = extractFloat64Exp( a );
3138
    bSig = extractFloat64Frac( b );
3139
    bExp = extractFloat64Exp( b );
3140
    expDiff = aExp - bExp;
3141
    aSig <<= 10;
3142
    bSig <<= 10;
3143
    if ( 0 < expDiff ) goto aExpBigger;
3144
    if ( expDiff < 0 ) goto bExpBigger;
3145
    if ( aExp == 0x7FF ) {
3146
        if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3147
        float_raise( float_flag_invalid STATUS_VAR);
3148
        return float64_default_nan;
3149
    }
3150
    if ( aExp == 0 ) {
3151
        aExp = 1;
3152
        bExp = 1;
3153
    }
3154
    if ( bSig < aSig ) goto aBigger;
3155
    if ( aSig < bSig ) goto bBigger;
3156
    return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3157
 bExpBigger:
3158
    if ( bExp == 0x7FF ) {
3159
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3160
        return packFloat64( zSign ^ 1, 0x7FF, 0 );
3161
    }
3162
    if ( aExp == 0 ) {
3163
        ++expDiff;
3164
    }
3165
    else {
3166
        aSig |= LIT64( 0x4000000000000000 );
3167
    }
3168
    shift64RightJamming( aSig, - expDiff, &aSig );
3169
    bSig |= LIT64( 0x4000000000000000 );
3170
 bBigger:
3171
    zSig = bSig - aSig;
3172
    zExp = bExp;
3173
    zSign ^= 1;
3174
    goto normalizeRoundAndPack;
3175
 aExpBigger:
3176
    if ( aExp == 0x7FF ) {
3177
        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3178
        return a;
3179
    }
3180
    if ( bExp == 0 ) {
3181
        --expDiff;
3182
    }
3183
    else {
3184
        bSig |= LIT64( 0x4000000000000000 );
3185
    }
3186
    shift64RightJamming( bSig, expDiff, &bSig );
3187
    aSig |= LIT64( 0x4000000000000000 );
3188
 aBigger:
3189
    zSig = aSig - bSig;
3190
    zExp = aExp;
3191
 normalizeRoundAndPack:
3192
    --zExp;
3193
    return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3194

    
3195
}
3196

    
3197
/*----------------------------------------------------------------------------
3198
| Returns the result of adding the double-precision floating-point values `a'
3199
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
3200
| Binary Floating-Point Arithmetic.
3201
*----------------------------------------------------------------------------*/
3202

    
3203
float64 float64_add( float64 a, float64 b STATUS_PARAM )
3204
{
3205
    flag aSign, bSign;
3206
    a = float64_squash_input_denormal(a STATUS_VAR);
3207
    b = float64_squash_input_denormal(b STATUS_VAR);
3208

    
3209
    aSign = extractFloat64Sign( a );
3210
    bSign = extractFloat64Sign( b );
3211
    if ( aSign == bSign ) {
3212
        return addFloat64Sigs( a, b, aSign STATUS_VAR );
3213
    }
3214
    else {
3215
        return subFloat64Sigs( a, b, aSign STATUS_VAR );
3216
    }
3217

    
3218
}
3219

    
3220
/*----------------------------------------------------------------------------
3221
| Returns the result of subtracting the double-precision floating-point values
3222
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3223
| for Binary Floating-Point Arithmetic.
3224
*----------------------------------------------------------------------------*/
3225

    
3226
float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3227
{
3228
    flag aSign, bSign;
3229
    a = float64_squash_input_denormal(a STATUS_VAR);
3230
    b = float64_squash_input_denormal(b STATUS_VAR);
3231

    
3232
    aSign = extractFloat64Sign( a );
3233
    bSign = extractFloat64Sign( b );
3234
    if ( aSign == bSign ) {
3235
        return subFloat64Sigs( a, b, aSign STATUS_VAR );
3236
    }
3237
    else {
3238
        return addFloat64Sigs( a, b, aSign STATUS_VAR );
3239
    }
3240

    
3241
}
3242

    
3243
/*----------------------------------------------------------------------------
3244
| Returns the result of multiplying the double-precision floating-point values
3245
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3246
| for Binary Floating-Point Arithmetic.
3247
*----------------------------------------------------------------------------*/
3248

    
3249
float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3250
{
3251
    flag aSign, bSign, zSign;
3252
    int16 aExp, bExp, zExp;
3253
    uint64_t aSig, bSig, zSig0, zSig1;
3254

    
3255
    a = float64_squash_input_denormal(a STATUS_VAR);
3256
    b = float64_squash_input_denormal(b STATUS_VAR);
3257

    
3258
    aSig = extractFloat64Frac( a );
3259
    aExp = extractFloat64Exp( a );
3260
    aSign = extractFloat64Sign( a );
3261
    bSig = extractFloat64Frac( b );
3262
    bExp = extractFloat64Exp( b );
3263
    bSign = extractFloat64Sign( b );
3264
    zSign = aSign ^ bSign;
3265
    if ( aExp == 0x7FF ) {
3266
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3267
            return propagateFloat64NaN( a, b STATUS_VAR );
3268
        }
3269
        if ( ( bExp | bSig ) == 0 ) {
3270
            float_raise( float_flag_invalid STATUS_VAR);
3271
            return float64_default_nan;
3272
        }
3273
        return packFloat64( zSign, 0x7FF, 0 );
3274
    }
3275
    if ( bExp == 0x7FF ) {
3276
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3277
        if ( ( aExp | aSig ) == 0 ) {
3278
            float_raise( float_flag_invalid STATUS_VAR);
3279
            return float64_default_nan;
3280
        }
3281
        return packFloat64( zSign, 0x7FF, 0 );
3282
    }
3283
    if ( aExp == 0 ) {
3284
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3285
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3286
    }
3287
    if ( bExp == 0 ) {
3288
        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3289
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3290
    }
3291
    zExp = aExp + bExp - 0x3FF;
3292
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3293
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3294
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
3295
    zSig0 |= ( zSig1 != 0 );
3296
    if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3297
        zSig0 <<= 1;
3298
        --zExp;
3299
    }
3300
    return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3301

    
3302
}
3303

    
3304
/*----------------------------------------------------------------------------
3305
| Returns the result of dividing the double-precision floating-point value `a'
3306
| by the corresponding value `b'.  The operation is performed according to
3307
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3308
*----------------------------------------------------------------------------*/
3309

    
3310
float64 float64_div( float64 a, float64 b STATUS_PARAM )
3311
{
3312
    flag aSign, bSign, zSign;
3313
    int16 aExp, bExp, zExp;
3314
    uint64_t aSig, bSig, zSig;
3315
    uint64_t rem0, rem1;
3316
    uint64_t term0, term1;
3317
    a = float64_squash_input_denormal(a STATUS_VAR);
3318
    b = float64_squash_input_denormal(b STATUS_VAR);
3319

    
3320
    aSig = extractFloat64Frac( a );
3321
    aExp = extractFloat64Exp( a );
3322
    aSign = extractFloat64Sign( a );
3323
    bSig = extractFloat64Frac( b );
3324
    bExp = extractFloat64Exp( b );
3325
    bSign = extractFloat64Sign( b );
3326
    zSign = aSign ^ bSign;
3327
    if ( aExp == 0x7FF ) {
3328
        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3329
        if ( bExp == 0x7FF ) {
3330
            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3331
            float_raise( float_flag_invalid STATUS_VAR);
3332
            return float64_default_nan;
3333
        }
3334
        return packFloat64( zSign, 0x7FF, 0 );
3335
    }
3336
    if ( bExp == 0x7FF ) {
3337
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3338
        return packFloat64( zSign, 0, 0 );
3339
    }
3340
    if ( bExp == 0 ) {
3341
        if ( bSig == 0 ) {
3342
            if ( ( aExp | aSig ) == 0 ) {
3343
                float_raise( float_flag_invalid STATUS_VAR);
3344
                return float64_default_nan;
3345
            }
3346
            float_raise( float_flag_divbyzero STATUS_VAR);
3347
            return packFloat64( zSign, 0x7FF, 0 );
3348
        }
3349
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3350
    }
3351
    if ( aExp == 0 ) {
3352
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3353
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3354
    }
3355
    zExp = aExp - bExp + 0x3FD;
3356
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3357
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3358
    if ( bSig <= ( aSig + aSig ) ) {
3359
        aSig >>= 1;
3360
        ++zExp;
3361
    }
3362
    zSig = estimateDiv128To64( aSig, 0, bSig );
3363
    if ( ( zSig & 0x1FF ) <= 2 ) {
3364
        mul64To128( bSig, zSig, &term0, &term1 );
3365
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3366
        while ( (int64_t) rem0 < 0 ) {
3367
            --zSig;
3368
            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3369
        }
3370
        zSig |= ( rem1 != 0 );
3371
    }
3372
    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3373

    
3374
}
3375

    
3376
/*----------------------------------------------------------------------------
3377
| Returns the remainder of the double-precision floating-point value `a'
3378
| with respect to the corresponding value `b'.  The operation is performed
3379
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3380
*----------------------------------------------------------------------------*/
3381

    
3382
float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3383
{
3384
    flag aSign, zSign;
3385
    int16 aExp, bExp, expDiff;
3386
    uint64_t aSig, bSig;
3387
    uint64_t q, alternateASig;
3388
    int64_t sigMean;
3389

    
3390
    a = float64_squash_input_denormal(a STATUS_VAR);
3391
    b = float64_squash_input_denormal(b STATUS_VAR);
3392
    aSig = extractFloat64Frac( a );
3393
    aExp = extractFloat64Exp( a );
3394
    aSign = extractFloat64Sign( a );
3395
    bSig = extractFloat64Frac( b );
3396
    bExp = extractFloat64Exp( b );
3397
    if ( aExp == 0x7FF ) {
3398
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3399
            return propagateFloat64NaN( a, b STATUS_VAR );
3400
        }
3401
        float_raise( float_flag_invalid STATUS_VAR);
3402
        return float64_default_nan;
3403
    }
3404
    if ( bExp == 0x7FF ) {
3405
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3406
        return a;
3407
    }
3408
    if ( bExp == 0 ) {
3409
        if ( bSig == 0 ) {
3410
            float_raise( float_flag_invalid STATUS_VAR);
3411
            return float64_default_nan;
3412
        }
3413
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3414
    }
3415
    if ( aExp == 0 ) {
3416
        if ( aSig == 0 ) return a;
3417
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3418
    }
3419
    expDiff = aExp - bExp;
3420
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3421
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3422
    if ( expDiff < 0 ) {
3423
        if ( expDiff < -1 ) return a;
3424
        aSig >>= 1;
3425
    }
3426
    q = ( bSig <= aSig );
3427
    if ( q ) aSig -= bSig;
3428
    expDiff -= 64;
3429
    while ( 0 < expDiff ) {
3430
        q = estimateDiv128To64( aSig, 0, bSig );
3431
        q = ( 2 < q ) ? q - 2 : 0;
3432
        aSig = - ( ( bSig>>2 ) * q );
3433
        expDiff -= 62;
3434
    }
3435
    expDiff += 64;
3436
    if ( 0 < expDiff ) {
3437
        q = estimateDiv128To64( aSig, 0, bSig );
3438
        q = ( 2 < q ) ? q - 2 : 0;
3439
        q >>= 64 - expDiff;
3440
        bSig >>= 2;
3441
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3442
    }
3443
    else {
3444
        aSig >>= 2;
3445
        bSig >>= 2;
3446
    }
3447
    do {
3448
        alternateASig = aSig;
3449
        ++q;
3450
        aSig -= bSig;
3451
    } while ( 0 <= (int64_t) aSig );
3452
    sigMean = aSig + alternateASig;
3453
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3454
        aSig = alternateASig;
3455
    }
3456
    zSign = ( (int64_t) aSig < 0 );
3457
    if ( zSign ) aSig = - aSig;
3458
    return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3459

    
3460
}
3461

    
3462
/*----------------------------------------------------------------------------
3463
| Returns the square root of the double-precision floating-point value `a'.
3464
| The operation is performed according to the IEC/IEEE Standard for Binary
3465
| Floating-Point Arithmetic.
3466
*----------------------------------------------------------------------------*/
3467

    
3468
float64 float64_sqrt( float64 a STATUS_PARAM )
3469
{
3470
    flag aSign;
3471
    int16 aExp, zExp;
3472
    uint64_t aSig, zSig, doubleZSig;
3473
    uint64_t rem0, rem1, term0, term1;
3474
    a = float64_squash_input_denormal(a STATUS_VAR);
3475

    
3476
    aSig = extractFloat64Frac( a );
3477
    aExp = extractFloat64Exp( a );
3478
    aSign = extractFloat64Sign( a );
3479
    if ( aExp == 0x7FF ) {
3480
        if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3481
        if ( ! aSign ) return a;
3482
        float_raise( float_flag_invalid STATUS_VAR);
3483
        return float64_default_nan;
3484
    }
3485
    if ( aSign ) {
3486
        if ( ( aExp | aSig ) == 0 ) return a;
3487
        float_raise( float_flag_invalid STATUS_VAR);
3488
        return float64_default_nan;
3489
    }
3490
    if ( aExp == 0 ) {
3491
        if ( aSig == 0 ) return float64_zero;
3492
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3493
    }
3494
    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3495
    aSig |= LIT64( 0x0010000000000000 );
3496
    zSig = estimateSqrt32( aExp, aSig>>21 );
3497
    aSig <<= 9 - ( aExp & 1 );
3498
    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3499
    if ( ( zSig & 0x1FF ) <= 5 ) {
3500
        doubleZSig = zSig<<1;
3501
        mul64To128( zSig, zSig, &term0, &term1 );
3502
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3503
        while ( (int64_t) rem0 < 0 ) {
3504
            --zSig;
3505
            doubleZSig -= 2;
3506
            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3507
        }
3508
        zSig |= ( ( rem0 | rem1 ) != 0 );
3509
    }
3510
    return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3511

    
3512
}
3513

    
3514
/*----------------------------------------------------------------------------
3515
| Returns the binary log of the double-precision floating-point value `a'.
3516
| The operation is performed according to the IEC/IEEE Standard for Binary
3517
| Floating-Point Arithmetic.
3518
*----------------------------------------------------------------------------*/
3519
float64 float64_log2( float64 a STATUS_PARAM )
3520
{
3521
    flag aSign, zSign;
3522
    int16 aExp;
3523
    uint64_t aSig, aSig0, aSig1, zSig, i;
3524
    a = float64_squash_input_denormal(a STATUS_VAR);
3525

    
3526
    aSig = extractFloat64Frac( a );
3527
    aExp = extractFloat64Exp( a );
3528
    aSign = extractFloat64Sign( a );
3529

    
3530
    if ( aExp == 0 ) {
3531
        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3532
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3533
    }
3534
    if ( aSign ) {
3535
        float_raise( float_flag_invalid STATUS_VAR);
3536
        return float64_default_nan;
3537
    }
3538
    if ( aExp == 0x7FF ) {
3539
        if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3540
        return a;
3541
    }
3542

    
3543
    aExp -= 0x3FF;
3544
    aSig |= LIT64( 0x0010000000000000 );
3545
    zSign = aExp < 0;
3546
    zSig = (uint64_t)aExp << 52;
3547
    for (i = 1LL << 51; i > 0; i >>= 1) {
3548
        mul64To128( aSig, aSig, &aSig0, &aSig1 );
3549
        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3550
        if ( aSig & LIT64( 0x0020000000000000 ) ) {
3551
            aSig >>= 1;
3552
            zSig |= i;
3553
        }
3554
    }
3555

    
3556
    if ( zSign )
3557
        zSig = -zSig;
3558
    return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3559
}
3560

    
3561
/*----------------------------------------------------------------------------
3562
| Returns 1 if the double-precision floating-point value `a' is equal to the
3563
| corresponding value `b', and 0 otherwise.  The invalid exception is raised
3564
| if either operand is a NaN.  Otherwise, the comparison is performed
3565
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3566
*----------------------------------------------------------------------------*/
3567

    
3568
int float64_eq( float64 a, float64 b STATUS_PARAM )
3569
{
3570
    uint64_t av, bv;
3571
    a = float64_squash_input_denormal(a STATUS_VAR);
3572
    b = float64_squash_input_denormal(b STATUS_VAR);
3573

    
3574
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3575
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3576
       ) {
3577
        float_raise( float_flag_invalid STATUS_VAR);
3578
        return 0;
3579
    }
3580
    av = float64_val(a);
3581
    bv = float64_val(b);
3582
    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3583

    
3584
}
3585

    
3586
/*----------------------------------------------------------------------------
3587
| Returns 1 if the double-precision floating-point value `a' is less than or
3588
| equal to the corresponding value `b', and 0 otherwise.  The invalid
3589
| exception is raised if either operand is a NaN.  The comparison is performed
3590
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3591
*----------------------------------------------------------------------------*/
3592

    
3593
int float64_le( float64 a, float64 b STATUS_PARAM )
3594
{
3595
    flag aSign, bSign;
3596
    uint64_t av, bv;
3597
    a = float64_squash_input_denormal(a STATUS_VAR);
3598
    b = float64_squash_input_denormal(b STATUS_VAR);
3599

    
3600
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3601
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3602
       ) {
3603
        float_raise( float_flag_invalid STATUS_VAR);
3604
        return 0;
3605
    }
3606
    aSign = extractFloat64Sign( a );
3607
    bSign = extractFloat64Sign( b );
3608
    av = float64_val(a);
3609
    bv = float64_val(b);
3610
    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3611
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3612

    
3613
}
3614

    
3615
/*----------------------------------------------------------------------------
3616
| Returns 1 if the double-precision floating-point value `a' is less than
3617
| the corresponding value `b', and 0 otherwise.  The invalid exception is
3618
| raised if either operand is a NaN.  The comparison is performed according
3619
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3620
*----------------------------------------------------------------------------*/
3621

    
3622
int float64_lt( float64 a, float64 b STATUS_PARAM )
3623
{
3624
    flag aSign, bSign;
3625
    uint64_t av, bv;
3626

    
3627
    a = float64_squash_input_denormal(a STATUS_VAR);
3628
    b = float64_squash_input_denormal(b STATUS_VAR);
3629
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3630
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3631
       ) {
3632
        float_raise( float_flag_invalid STATUS_VAR);
3633
        return 0;
3634
    }
3635
    aSign = extractFloat64Sign( a );
3636
    bSign = extractFloat64Sign( b );
3637
    av = float64_val(a);
3638
    bv = float64_val(b);
3639
    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3640
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3641

    
3642
}
3643

    
3644
/*----------------------------------------------------------------------------
3645
| Returns 1 if the double-precision floating-point values `a' and `b' cannot
3646
| be compared, and 0 otherwise.  The invalid exception is raised if either
3647
| operand is a NaN.  The comparison is performed according to the IEC/IEEE
3648
| Standard for Binary Floating-Point Arithmetic.
3649
*----------------------------------------------------------------------------*/
3650

    
3651
int float64_unordered( float64 a, float64 b STATUS_PARAM )
3652
{
3653
    a = float64_squash_input_denormal(a STATUS_VAR);
3654
    b = float64_squash_input_denormal(b STATUS_VAR);
3655

    
3656
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3657
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3658
       ) {
3659
        float_raise( float_flag_invalid STATUS_VAR);
3660
        return 1;
3661
    }
3662
    return 0;
3663
}
3664

    
3665
/*----------------------------------------------------------------------------
3666
| Returns 1 if the double-precision floating-point value `a' is equal to the
3667
| corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3668
| exception.The comparison is performed according to the IEC/IEEE Standard
3669
| for Binary Floating-Point Arithmetic.
3670
*----------------------------------------------------------------------------*/
3671

    
3672
int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
3673
{
3674
    uint64_t av, bv;
3675
    a = float64_squash_input_denormal(a STATUS_VAR);
3676
    b = float64_squash_input_denormal(b STATUS_VAR);
3677

    
3678
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3679
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3680
       ) {
3681
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3682
            float_raise( float_flag_invalid STATUS_VAR);
3683
        }
3684
        return 0;
3685
    }
3686
    av = float64_val(a);
3687
    bv = float64_val(b);
3688
    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3689

    
3690
}
3691

    
3692
/*----------------------------------------------------------------------------
3693
| Returns 1 if the double-precision floating-point value `a' is less than or
3694
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3695
| cause an exception.  Otherwise, the comparison is performed according to the
3696
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3697
*----------------------------------------------------------------------------*/
3698

    
3699
int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3700
{
3701
    flag aSign, bSign;
3702
    uint64_t av, bv;
3703
    a = float64_squash_input_denormal(a STATUS_VAR);
3704
    b = float64_squash_input_denormal(b STATUS_VAR);
3705

    
3706
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3707
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3708
       ) {
3709
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3710
            float_raise( float_flag_invalid STATUS_VAR);
3711
        }
3712
        return 0;
3713
    }
3714
    aSign = extractFloat64Sign( a );
3715
    bSign = extractFloat64Sign( b );
3716
    av = float64_val(a);
3717
    bv = float64_val(b);
3718
    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3719
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3720

    
3721
}
3722

    
3723
/*----------------------------------------------------------------------------
3724
| Returns 1 if the double-precision floating-point value `a' is less than
3725
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3726
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3727
| Standard for Binary Floating-Point Arithmetic.
3728
*----------------------------------------------------------------------------*/
3729

    
3730
int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3731
{
3732
    flag aSign, bSign;
3733
    uint64_t av, bv;
3734
    a = float64_squash_input_denormal(a STATUS_VAR);
3735
    b = float64_squash_input_denormal(b STATUS_VAR);
3736

    
3737
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3738
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3739
       ) {
3740
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3741
            float_raise( float_flag_invalid STATUS_VAR);
3742
        }
3743
        return 0;
3744
    }
3745
    aSign = extractFloat64Sign( a );
3746
    bSign = extractFloat64Sign( b );
3747
    av = float64_val(a);
3748
    bv = float64_val(b);
3749
    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3750
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3751

    
3752
}
3753

    
3754
/*----------------------------------------------------------------------------
3755
| Returns 1 if the double-precision floating-point values `a' and `b' cannot
3756
| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3757
| comparison is performed according to the IEC/IEEE Standard for Binary
3758
| Floating-Point Arithmetic.
3759
*----------------------------------------------------------------------------*/
3760

    
3761
int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
3762
{
3763
    a = float64_squash_input_denormal(a STATUS_VAR);
3764
    b = float64_squash_input_denormal(b STATUS_VAR);
3765

    
3766
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3767
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3768
       ) {
3769
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3770
            float_raise( float_flag_invalid STATUS_VAR);
3771
        }
3772
        return 1;
3773
    }
3774
    return 0;
3775
}
3776

    
3777
/*----------------------------------------------------------------------------
3778
| Returns the result of converting the extended double-precision floating-
3779
| point value `a' to the 32-bit two's complement integer format.  The
3780
| conversion is performed according to the IEC/IEEE Standard for Binary
3781
| Floating-Point Arithmetic---which means in particular that the conversion
3782
| is rounded according to the current rounding mode.  If `a' is a NaN, the
3783
| largest positive integer is returned.  Otherwise, if the conversion
3784
| overflows, the largest integer with the same sign as `a' is returned.
3785
*----------------------------------------------------------------------------*/
3786

    
3787
int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3788
{
3789
    flag aSign;
3790
    int32 aExp, shiftCount;
3791
    uint64_t aSig;
3792

    
3793
    aSig = extractFloatx80Frac( a );
3794
    aExp = extractFloatx80Exp( a );
3795
    aSign = extractFloatx80Sign( a );
3796
    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3797
    shiftCount = 0x4037 - aExp;
3798
    if ( shiftCount <= 0 ) shiftCount = 1;
3799
    shift64RightJamming( aSig, shiftCount, &aSig );
3800
    return roundAndPackInt32( aSign, aSig STATUS_VAR );
3801

    
3802
}
3803

    
3804
/*----------------------------------------------------------------------------
3805
| Returns the result of converting the extended double-precision floating-
3806
| point value `a' to the 32-bit two's complement integer format.  The
3807
| conversion is performed according to the IEC/IEEE Standard for Binary
3808
| Floating-Point Arithmetic, except that the conversion is always rounded
3809
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3810
| Otherwise, if the conversion overflows, the largest integer with the same
3811
| sign as `a' is returned.
3812
*----------------------------------------------------------------------------*/
3813

    
3814
int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3815
{
3816
    flag aSign;
3817
    int32 aExp, shiftCount;
3818
    uint64_t aSig, savedASig;
3819
    int32 z;
3820

    
3821
    aSig = extractFloatx80Frac( a );
3822
    aExp = extractFloatx80Exp( a );
3823
    aSign = extractFloatx80Sign( a );
3824
    if ( 0x401E < aExp ) {
3825
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3826
        goto invalid;
3827
    }
3828
    else if ( aExp < 0x3FFF ) {
3829
        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3830
        return 0;
3831
    }
3832
    shiftCount = 0x403E - aExp;
3833
    savedASig = aSig;
3834
    aSig >>= shiftCount;
3835
    z = aSig;
3836
    if ( aSign ) z = - z;
3837
    if ( ( z < 0 ) ^ aSign ) {
3838
 invalid:
3839
        float_raise( float_flag_invalid STATUS_VAR);
3840
        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3841
    }
3842
    if ( ( aSig<<shiftCount ) != savedASig ) {
3843
        STATUS(float_exception_flags) |= float_flag_inexact;
3844
    }
3845
    return z;
3846

    
3847
}
3848

    
3849
/*----------------------------------------------------------------------------
3850
| Returns the result of converting the extended double-precision floating-
3851
| point value `a' to the 64-bit two's complement integer format.  The
3852
| conversion is performed according to the IEC/IEEE Standard for Binary
3853
| Floating-Point Arithmetic---which means in particular that the conversion
3854
| is rounded according to the current rounding mode.  If `a' is a NaN,
3855
| the largest positive integer is returned.  Otherwise, if the conversion
3856
| overflows, the largest integer with the same sign as `a' is returned.
3857
*----------------------------------------------------------------------------*/
3858

    
3859
int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3860
{
3861
    flag aSign;
3862
    int32 aExp, shiftCount;
3863
    uint64_t aSig, aSigExtra;
3864

    
3865
    aSig = extractFloatx80Frac( a );
3866
    aExp = extractFloatx80Exp( a );
3867
    aSign = extractFloatx80Sign( a );
3868
    shiftCount = 0x403E - aExp;
3869
    if ( shiftCount <= 0 ) {
3870
        if ( shiftCount ) {
3871
            float_raise( float_flag_invalid STATUS_VAR);
3872
            if (    ! aSign
3873
                 || (    ( aExp == 0x7FFF )
3874
                      && ( aSig != LIT64( 0x8000000000000000 ) ) )
3875
               ) {
3876
                return LIT64( 0x7FFFFFFFFFFFFFFF );
3877
            }
3878
            return (int64_t) LIT64( 0x8000000000000000 );
3879
        }
3880
        aSigExtra = 0;
3881
    }
3882
    else {
3883
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3884
    }
3885
    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3886

    
3887
}
3888

    
3889
/*----------------------------------------------------------------------------
3890
| Returns the result of converting the extended double-precision floating-
3891
| point value `a' to the 64-bit two's complement integer format.  The
3892
| conversion is performed according to the IEC/IEEE Standard for Binary
3893
| Floating-Point Arithmetic, except that the conversion is always rounded
3894
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3895
| Otherwise, if the conversion overflows, the largest integer with the same
3896
| sign as `a' is returned.
3897
*----------------------------------------------------------------------------*/
3898

    
3899
int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3900
{
3901
    flag aSign;
3902
    int32 aExp, shiftCount;
3903
    uint64_t aSig;
3904
    int64 z;
3905

    
3906
    aSig = extractFloatx80Frac( a );
3907
    aExp = extractFloatx80Exp( a );
3908
    aSign = extractFloatx80Sign( a );
3909
    shiftCount = aExp - 0x403E;
3910
    if ( 0 <= shiftCount ) {
3911
        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3912
        if ( ( a.high != 0xC03E ) || aSig ) {
3913
            float_raise( float_flag_invalid STATUS_VAR);
3914
            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3915
                return LIT64( 0x7FFFFFFFFFFFFFFF );
3916
            }
3917
        }
3918
        return (int64_t) LIT64( 0x8000000000000000 );
3919
    }
3920
    else if ( aExp < 0x3FFF ) {
3921
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3922
        return 0;
3923
    }
3924
    z = aSig>>( - shiftCount );
3925
    if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3926
        STATUS(float_exception_flags) |= float_flag_inexact;
3927
    }
3928
    if ( aSign ) z = - z;
3929
    return z;
3930

    
3931
}
3932

    
3933
/*----------------------------------------------------------------------------
3934
| Returns the result of converting the extended double-precision floating-
3935
| point value `a' to the single-precision floating-point format.  The
3936
| conversion is performed according to the IEC/IEEE Standard for Binary
3937
| Floating-Point Arithmetic.
3938
*----------------------------------------------------------------------------*/
3939

    
3940
float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3941
{
3942
    flag aSign;
3943
    int32 aExp;
3944
    uint64_t aSig;
3945

    
3946
    aSig = extractFloatx80Frac( a );
3947
    aExp = extractFloatx80Exp( a );
3948
    aSign = extractFloatx80Sign( a );
3949
    if ( aExp == 0x7FFF ) {
3950
        if ( (uint64_t) ( aSig<<1 ) ) {
3951
            return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3952
        }
3953
        return packFloat32( aSign, 0xFF, 0 );
3954
    }
3955
    shift64RightJamming( aSig, 33, &aSig );
3956
    if ( aExp || aSig ) aExp -= 0x3F81;
3957
    return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3958

    
3959
}
3960

    
3961
/*----------------------------------------------------------------------------
3962
| Returns the result of converting the extended double-precision floating-
3963
| point value `a' to the double-precision floating-point format.  The
3964
| conversion is performed according to the IEC/IEEE Standard for Binary
3965
| Floating-Point Arithmetic.
3966
*----------------------------------------------------------------------------*/
3967

    
3968
float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3969
{
3970
    flag aSign;
3971
    int32 aExp;
3972
    uint64_t aSig, zSig;
3973

    
3974
    aSig = extractFloatx80Frac( a );
3975
    aExp = extractFloatx80Exp( a );
3976
    aSign = extractFloatx80Sign( a );
3977
    if ( aExp == 0x7FFF ) {
3978
        if ( (uint64_t) ( aSig<<1 ) ) {
3979
            return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3980
        }
3981
        return packFloat64( aSign, 0x7FF, 0 );
3982
    }
3983
    shift64RightJamming( aSig, 1, &zSig );
3984
    if ( aExp || aSig ) aExp -= 0x3C01;
3985
    return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3986

    
3987
}
3988

    
3989
/*----------------------------------------------------------------------------
3990
| Returns the result of converting the extended double-precision floating-
3991
| point value `a' to the quadruple-precision floating-point format.  The
3992
| conversion is performed according to the IEC/IEEE Standard for Binary
3993
| Floating-Point Arithmetic.
3994
*----------------------------------------------------------------------------*/
3995

    
3996
float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3997
{
3998
    flag aSign;
3999
    int16 aExp;
4000
    uint64_t aSig, zSig0, zSig1;
4001

    
4002
    aSig = extractFloatx80Frac( a );
4003
    aExp = extractFloatx80Exp( a );
4004
    aSign = extractFloatx80Sign( a );
4005
    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4006
        return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4007
    }
4008
    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4009
    return packFloat128( aSign, aExp, zSig0, zSig1 );
4010

    
4011
}
4012

    
4013
/*----------------------------------------------------------------------------
4014
| Rounds the extended double-precision floating-point value `a' to an integer,
4015
| and returns the result as an extended quadruple-precision floating-point
4016
| value.  The operation is performed according to the IEC/IEEE Standard for
4017
| Binary Floating-Point Arithmetic.
4018
*----------------------------------------------------------------------------*/
4019

    
4020
floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4021
{
4022
    flag aSign;
4023
    int32 aExp;
4024
    uint64_t lastBitMask, roundBitsMask;
4025
    int8 roundingMode;
4026
    floatx80 z;
4027

    
4028
    aExp = extractFloatx80Exp( a );
4029
    if ( 0x403E <= aExp ) {
4030
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4031
            return propagateFloatx80NaN( a, a STATUS_VAR );
4032
        }
4033
        return a;
4034
    }
4035
    if ( aExp < 0x3FFF ) {
4036
        if (    ( aExp == 0 )
4037
             && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4038
            return a;
4039
        }
4040
        STATUS(float_exception_flags) |= float_flag_inexact;
4041
        aSign = extractFloatx80Sign( a );
4042
        switch ( STATUS(float_rounding_mode) ) {
4043
         case float_round_nearest_even:
4044
            if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4045
               ) {
4046
                return
4047
                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4048
            }
4049
            break;
4050
         case float_round_down:
4051
            return
4052
                  aSign ?
4053
                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4054
                : packFloatx80( 0, 0, 0 );
4055
         case float_round_up:
4056
            return
4057
                  aSign ? packFloatx80( 1, 0, 0 )
4058
                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4059
        }
4060
        return packFloatx80( aSign, 0, 0 );
4061
    }
4062
    lastBitMask = 1;
4063
    lastBitMask <<= 0x403E - aExp;
4064
    roundBitsMask = lastBitMask - 1;
4065
    z = a;
4066
    roundingMode = STATUS(float_rounding_mode);
4067
    if ( roundingMode == float_round_nearest_even ) {
4068
        z.low += lastBitMask>>1;
4069
        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4070
    }
4071
    else if ( roundingMode != float_round_to_zero ) {
4072
        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4073
            z.low += roundBitsMask;
4074
        }
4075
    }
4076
    z.low &= ~ roundBitsMask;
4077
    if ( z.low == 0 ) {
4078
        ++z.high;
4079
        z.low = LIT64( 0x8000000000000000 );
4080
    }
4081
    if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4082
    return z;
4083

    
4084
}
4085

    
4086
/*----------------------------------------------------------------------------
4087
| Returns the result of adding the absolute values of the extended double-
4088
| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4089
| negated before being returned.  `zSign' is ignored if the result is a NaN.
4090
| The addition is performed according to the IEC/IEEE Standard for Binary
4091
| Floating-Point Arithmetic.
4092
*----------------------------------------------------------------------------*/
4093

    
4094
static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4095
{
4096
    int32 aExp, bExp, zExp;
4097
    uint64_t aSig, bSig, zSig0, zSig1;
4098
    int32 expDiff;
4099

    
4100
    aSig = extractFloatx80Frac( a );
4101
    aExp = extractFloatx80Exp( a );
4102
    bSig = extractFloatx80Frac( b );
4103
    bExp = extractFloatx80Exp( b );
4104
    expDiff = aExp - bExp;
4105
    if ( 0 < expDiff ) {
4106
        if ( aExp == 0x7FFF ) {
4107
            if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4108
            return a;
4109
        }
4110
        if ( bExp == 0 ) --expDiff;
4111
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4112
        zExp = aExp;
4113
    }
4114
    else if ( expDiff < 0 ) {
4115
        if ( bExp == 0x7FFF ) {
4116
            if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4117
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4118
        }
4119
        if ( aExp == 0 ) ++expDiff;
4120
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4121
        zExp = bExp;
4122
    }
4123
    else {
4124
        if ( aExp == 0x7FFF ) {
4125
            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4126
                return propagateFloatx80NaN( a, b STATUS_VAR );
4127
            }
4128
            return a;
4129
        }
4130
        zSig1 = 0;
4131
        zSig0 = aSig + bSig;
4132
        if ( aExp == 0 ) {
4133
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4134
            goto roundAndPack;
4135
        }
4136
        zExp = aExp;
4137
        goto shiftRight1;
4138
    }
4139
    zSig0 = aSig + bSig;
4140
    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4141
 shiftRight1:
4142
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4143
    zSig0 |= LIT64( 0x8000000000000000 );
4144
    ++zExp;
4145
 roundAndPack:
4146
    return
4147
        roundAndPackFloatx80(
4148
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4149

    
4150
}
4151

    
4152
/*----------------------------------------------------------------------------
4153
| Returns the result of subtracting the absolute values of the extended
4154
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4155
| difference is negated before being returned.  `zSign' is ignored if the
4156
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
4157
| Standard for Binary Floating-Point Arithmetic.
4158
*----------------------------------------------------------------------------*/
4159

    
4160
static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4161
{
4162
    int32 aExp, bExp, zExp;
4163
    uint64_t aSig, bSig, zSig0, zSig1;
4164
    int32 expDiff;
4165
    floatx80 z;
4166

    
4167
    aSig = extractFloatx80Frac( a );
4168
    aExp = extractFloatx80Exp( a );
4169
    bSig = extractFloatx80Frac( b );
4170
    bExp = extractFloatx80Exp( b );
4171
    expDiff = aExp - bExp;
4172
    if ( 0 < expDiff ) goto aExpBigger;
4173
    if ( expDiff < 0 ) goto bExpBigger;
4174
    if ( aExp == 0x7FFF ) {
4175
        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4176
            return propagateFloatx80NaN( a, b STATUS_VAR );
4177
        }
4178
        float_raise( float_flag_invalid STATUS_VAR);
4179
        z.low = floatx80_default_nan_low;
4180
        z.high = floatx80_default_nan_high;
4181
        return z;
4182
    }
4183
    if ( aExp == 0 ) {
4184
        aExp = 1;
4185
        bExp = 1;
4186
    }
4187
    zSig1 = 0;
4188
    if ( bSig < aSig ) goto aBigger;
4189
    if ( aSig < bSig ) goto bBigger;
4190
    return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4191
 bExpBigger:
4192
    if ( bExp == 0x7FFF ) {
4193
        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4194
        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4195
    }
4196
    if ( aExp == 0 ) ++expDiff;
4197
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4198
 bBigger:
4199
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4200
    zExp = bExp;
4201
    zSign ^= 1;
4202
    goto normalizeRoundAndPack;
4203
 aExpBigger:
4204
    if ( aExp == 0x7FFF ) {
4205
        if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4206
        return a;
4207
    }
4208
    if ( bExp == 0 ) --expDiff;
4209
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4210
 aBigger:
4211
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4212
    zExp = aExp;
4213
 normalizeRoundAndPack:
4214
    return
4215
        normalizeRoundAndPackFloatx80(
4216
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4217

    
4218
}
4219

    
4220
/*----------------------------------------------------------------------------
4221
| Returns the result of adding the extended double-precision floating-point
4222
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4223
| Standard for Binary Floating-Point Arithmetic.
4224
*----------------------------------------------------------------------------*/
4225

    
4226
floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4227
{
4228
    flag aSign, bSign;
4229

    
4230
    aSign = extractFloatx80Sign( a );
4231
    bSign = extractFloatx80Sign( b );
4232
    if ( aSign == bSign ) {
4233
        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4234
    }
4235
    else {
4236
        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4237
    }
4238

    
4239
}
4240

    
4241
/*----------------------------------------------------------------------------
4242
| Returns the result of subtracting the extended double-precision floating-
4243
| point values `a' and `b'.  The operation is performed according to the
4244
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4245
*----------------------------------------------------------------------------*/
4246

    
4247
floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4248
{
4249
    flag aSign, bSign;
4250

    
4251
    aSign = extractFloatx80Sign( a );
4252
    bSign = extractFloatx80Sign( b );
4253
    if ( aSign == bSign ) {
4254
        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4255
    }
4256
    else {
4257
        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4258
    }
4259

    
4260
}
4261

    
4262
/*----------------------------------------------------------------------------
4263
| Returns the result of multiplying the extended double-precision floating-
4264
| point values `a' and `b'.  The operation is performed according to the
4265
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4266
*----------------------------------------------------------------------------*/
4267

    
4268
floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4269
{
4270
    flag aSign, bSign, zSign;
4271
    int32 aExp, bExp, zExp;
4272
    uint64_t aSig, bSig, zSig0, zSig1;
4273
    floatx80 z;
4274

    
4275
    aSig = extractFloatx80Frac( a );
4276
    aExp = extractFloatx80Exp( a );
4277
    aSign = extractFloatx80Sign( a );
4278
    bSig = extractFloatx80Frac( b );
4279
    bExp = extractFloatx80Exp( b );
4280
    bSign = extractFloatx80Sign( b );
4281
    zSign = aSign ^ bSign;
4282
    if ( aExp == 0x7FFF ) {
4283
        if (    (uint64_t) ( aSig<<1 )
4284
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4285
            return propagateFloatx80NaN( a, b STATUS_VAR );
4286
        }
4287
        if ( ( bExp | bSig ) == 0 ) goto invalid;
4288
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4289
    }
4290
    if ( bExp == 0x7FFF ) {
4291
        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4292
        if ( ( aExp | aSig ) == 0 ) {
4293
 invalid:
4294
            float_raise( float_flag_invalid STATUS_VAR);
4295
            z.low = floatx80_default_nan_low;
4296
            z.high = floatx80_default_nan_high;
4297
            return z;
4298
        }
4299
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4300
    }
4301
    if ( aExp == 0 ) {
4302
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4303
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4304
    }
4305
    if ( bExp == 0 ) {
4306
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4307
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4308
    }
4309
    zExp = aExp + bExp - 0x3FFE;
4310
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
4311
    if ( 0 < (int64_t) zSig0 ) {
4312
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4313
        --zExp;
4314
    }
4315
    return
4316
        roundAndPackFloatx80(
4317
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4318

    
4319
}
4320

    
4321
/*----------------------------------------------------------------------------
4322
| Returns the result of dividing the extended double-precision floating-point
4323
| value `a' by the corresponding value `b'.  The operation is performed
4324
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4325
*----------------------------------------------------------------------------*/
4326

    
4327
floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4328
{
4329
    flag aSign, bSign, zSign;
4330
    int32 aExp, bExp, zExp;
4331
    uint64_t aSig, bSig, zSig0, zSig1;
4332
    uint64_t rem0, rem1, rem2, term0, term1, term2;
4333
    floatx80 z;
4334

    
4335
    aSig = extractFloatx80Frac( a );
4336
    aExp = extractFloatx80Exp( a );
4337
    aSign = extractFloatx80Sign( a );
4338
    bSig = extractFloatx80Frac( b );
4339
    bExp = extractFloatx80Exp( b );
4340
    bSign = extractFloatx80Sign( b );
4341
    zSign = aSign ^ bSign;
4342
    if ( aExp == 0x7FFF ) {
4343
        if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4344
        if ( bExp == 0x7FFF ) {
4345
            if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4346
            goto invalid;
4347
        }
4348
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4349
    }
4350
    if ( bExp == 0x7FFF ) {
4351
        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4352
        return packFloatx80( zSign, 0, 0 );
4353
    }
4354
    if ( bExp == 0 ) {
4355
        if ( bSig == 0 ) {
4356
            if ( ( aExp | aSig ) == 0 ) {
4357
 invalid:
4358
                float_raise( float_flag_invalid STATUS_VAR);
4359
                z.low = floatx80_default_nan_low;
4360
                z.high = floatx80_default_nan_high;
4361
                return z;
4362
            }
4363
            float_raise( float_flag_divbyzero STATUS_VAR);
4364
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4365
        }
4366
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4367
    }
4368
    if ( aExp == 0 ) {
4369
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4370
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4371
    }
4372
    zExp = aExp - bExp + 0x3FFE;
4373
    rem1 = 0;
4374
    if ( bSig <= aSig ) {
4375
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
4376
        ++zExp;
4377
    }
4378
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4379
    mul64To128( bSig, zSig0, &term0, &term1 );
4380
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4381
    while ( (int64_t) rem0 < 0 ) {
4382
        --zSig0;
4383
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4384
    }
4385
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
4386
    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4387
        mul64To128( bSig, zSig1, &term1, &term2 );
4388
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4389
        while ( (int64_t) rem1 < 0 ) {
4390
            --zSig1;
4391
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4392
        }
4393
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
4394
    }
4395
    return
4396
        roundAndPackFloatx80(
4397
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4398

    
4399
}
4400

    
4401
/*----------------------------------------------------------------------------
4402
| Returns the remainder of the extended double-precision floating-point value
4403
| `a' with respect to the corresponding value `b'.  The operation is performed
4404
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4405
*----------------------------------------------------------------------------*/
4406

    
4407
floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4408
{
4409
    flag aSign, zSign;
4410
    int32 aExp, bExp, expDiff;
4411
    uint64_t aSig0, aSig1, bSig;
4412
    uint64_t q, term0, term1, alternateASig0, alternateASig1;
4413
    floatx80 z;
4414

    
4415
    aSig0 = extractFloatx80Frac( a );
4416
    aExp = extractFloatx80Exp( a );
4417
    aSign = extractFloatx80Sign( a );
4418
    bSig = extractFloatx80Frac( b );
4419
    bExp = extractFloatx80Exp( b );
4420
    if ( aExp == 0x7FFF ) {
4421
        if (    (uint64_t) ( aSig0<<1 )
4422
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4423
            return propagateFloatx80NaN( a, b STATUS_VAR );
4424
        }
4425
        goto invalid;
4426
    }
4427
    if ( bExp == 0x7FFF ) {
4428
        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4429
        return a;
4430
    }
4431
    if ( bExp == 0 ) {
4432
        if ( bSig == 0 ) {
4433
 invalid:
4434
            float_raise( float_flag_invalid STATUS_VAR);
4435
            z.low = floatx80_default_nan_low;
4436
            z.high = floatx80_default_nan_high;
4437
            return z;
4438
        }
4439
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4440
    }
4441
    if ( aExp == 0 ) {
4442
        if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
4443
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4444
    }
4445
    bSig |= LIT64( 0x8000000000000000 );
4446
    zSign = aSign;
4447
    expDiff = aExp - bExp;
4448
    aSig1 = 0;
4449
    if ( expDiff < 0 ) {
4450
        if ( expDiff < -1 ) return a;
4451
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4452
        expDiff = 0;
4453
    }
4454
    q = ( bSig <= aSig0 );
4455
    if ( q ) aSig0 -= bSig;
4456
    expDiff -= 64;
4457
    while ( 0 < expDiff ) {
4458
        q = estimateDiv128To64( aSig0, aSig1, bSig );
4459
        q = ( 2 < q ) ? q - 2 : 0;
4460
        mul64To128( bSig, q, &term0, &term1 );
4461
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4462
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4463
        expDiff -= 62;
4464
    }
4465
    expDiff += 64;
4466
    if ( 0 < expDiff ) {
4467
        q = estimateDiv128To64( aSig0, aSig1, bSig );
4468
        q = ( 2 < q ) ? q - 2 : 0;
4469
        q >>= 64 - expDiff;
4470
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4471
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4472
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4473
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
4474
            ++q;
4475
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4476
        }
4477
    }
4478
    else {
4479
        term1 = 0;
4480
        term0 = bSig;
4481
    }
4482
    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4483
    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4484
         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4485
              && ( q & 1 ) )
4486
       ) {
4487
        aSig0 = alternateASig0;
4488
        aSig1 = alternateASig1;
4489
        zSign = ! zSign;
4490
    }
4491
    return
4492
        normalizeRoundAndPackFloatx80(
4493
            80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4494

    
4495
}
4496

    
4497
/*----------------------------------------------------------------------------
4498
| Returns the square root of the extended double-precision floating-point
4499
| value `a'.  The operation is performed according to the IEC/IEEE Standard
4500
| for Binary Floating-Point Arithmetic.
4501
*----------------------------------------------------------------------------*/
4502

    
4503
floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4504
{
4505
    flag aSign;
4506
    int32 aExp, zExp;
4507
    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4508
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4509
    floatx80 z;
4510

    
4511
    aSig0 = extractFloatx80Frac( a );
4512
    aExp = extractFloatx80Exp( a );
4513
    aSign = extractFloatx80Sign( a );
4514
    if ( aExp == 0x7FFF ) {
4515
        if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4516
        if ( ! aSign ) return a;
4517
        goto invalid;
4518
    }
4519
    if ( aSign ) {
4520
        if ( ( aExp | aSig0 ) == 0 ) return a;
4521
 invalid:
4522
        float_raise( float_flag_invalid STATUS_VAR);
4523
        z.low = floatx80_default_nan_low;
4524
        z.high = floatx80_default_nan_high;
4525
        return z;
4526
    }
4527
    if ( aExp == 0 ) {
4528
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4529
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4530
    }
4531
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4532
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4533
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4534
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4535
    doubleZSig0 = zSig0<<1;
4536
    mul64To128( zSig0, zSig0, &term0, &term1 );
4537
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4538
    while ( (int64_t) rem0 < 0 ) {
4539
        --zSig0;
4540
        doubleZSig0 -= 2;
4541
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4542
    }
4543
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4544
    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4545
        if ( zSig1 == 0 ) zSig1 = 1;
4546
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4547
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4548
        mul64To128( zSig1, zSig1, &term2, &term3 );
4549
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4550
        while ( (int64_t) rem1 < 0 ) {
4551
            --zSig1;
4552
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4553
            term3 |= 1;
4554
            term2 |= doubleZSig0;
4555
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4556
        }
4557
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4558
    }
4559
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4560
    zSig0 |= doubleZSig0;
4561
    return
4562
        roundAndPackFloatx80(
4563
            STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4564

    
4565
}
4566

    
4567
/*----------------------------------------------------------------------------
4568
| Returns 1 if the extended double-precision floating-point value `a' is equal
4569
| to the corresponding value `b', and 0 otherwise.  The invalid exception is
4570
| raised if either operand is a NaN.  Otherwise, the comparison is performed
4571
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4572
*----------------------------------------------------------------------------*/
4573

    
4574
int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4575
{
4576

    
4577
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4578
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4579
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4580
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4581
       ) {
4582
        float_raise( float_flag_invalid STATUS_VAR);
4583
        return 0;
4584
    }
4585
    return
4586
           ( a.low == b.low )
4587
        && (    ( a.high == b.high )
4588
             || (    ( a.low == 0 )
4589
                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4590
           );
4591

    
4592
}
4593

    
4594
/*----------------------------------------------------------------------------
4595
| Returns 1 if the extended double-precision floating-point value `a' is
4596
| less than or equal to the corresponding value `b', and 0 otherwise.  The
4597
| invalid exception is raised if either operand is a NaN.  The comparison is
4598
| performed according to the IEC/IEEE Standard for Binary Floating-Point
4599
| Arithmetic.
4600
*----------------------------------------------------------------------------*/
4601

    
4602
int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4603
{
4604
    flag aSign, bSign;
4605

    
4606
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4607
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4608
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4609
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4610
       ) {
4611
        float_raise( float_flag_invalid STATUS_VAR);
4612
        return 0;
4613
    }
4614
    aSign = extractFloatx80Sign( a );
4615
    bSign = extractFloatx80Sign( b );
4616
    if ( aSign != bSign ) {
4617
        return
4618
               aSign
4619
            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4620
                 == 0 );
4621
    }
4622
    return
4623
          aSign ? le128( b.high, b.low, a.high, a.low )
4624
        : le128( a.high, a.low, b.high, b.low );
4625

    
4626
}
4627

    
4628
/*----------------------------------------------------------------------------
4629
| Returns 1 if the extended double-precision floating-point value `a' is
4630
| less than the corresponding value `b', and 0 otherwise.  The invalid
4631
| exception is raised if either operand is a NaN.  The comparison is performed
4632
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4633
*----------------------------------------------------------------------------*/
4634

    
4635
int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4636
{
4637
    flag aSign, bSign;
4638

    
4639
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4640
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4641
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4642
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4643
       ) {
4644
        float_raise( float_flag_invalid STATUS_VAR);
4645
        return 0;
4646
    }
4647
    aSign = extractFloatx80Sign( a );
4648
    bSign = extractFloatx80Sign( b );
4649
    if ( aSign != bSign ) {
4650
        return
4651
               aSign
4652
            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4653
                 != 0 );
4654
    }
4655
    return
4656
          aSign ? lt128( b.high, b.low, a.high, a.low )
4657
        : lt128( a.high, a.low, b.high, b.low );
4658

    
4659
}
4660

    
4661
/*----------------------------------------------------------------------------
4662
| Returns 1 if the extended double-precision floating-point values `a' and `b'
4663
| cannot be compared, and 0 otherwise.  The invalid exception is raised if
4664
| either operand is a NaN.   The comparison is performed according to the
4665
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4666
*----------------------------------------------------------------------------*/
4667
int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
4668
{
4669
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4670
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4671
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4672
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4673
       ) {
4674
        float_raise( float_flag_invalid STATUS_VAR);
4675
        return 1;
4676
    }
4677
    return 0;
4678
}
4679

    
4680
/*----------------------------------------------------------------------------
4681
| Returns 1 if the extended double-precision floating-point value `a' is
4682
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4683
| cause an exception.  The comparison is performed according to the IEC/IEEE
4684
| Standard for Binary Floating-Point Arithmetic.
4685
*----------------------------------------------------------------------------*/
4686

    
4687
int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4688
{
4689

    
4690
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4691
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4692
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4693
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4694
       ) {
4695
        if (    floatx80_is_signaling_nan( a )
4696
             || floatx80_is_signaling_nan( b ) ) {
4697
            float_raise( float_flag_invalid STATUS_VAR);
4698
        }
4699
        return 0;
4700
    }
4701
    return
4702
           ( a.low == b.low )
4703
        && (    ( a.high == b.high )
4704
             || (    ( a.low == 0 )
4705
                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4706
           );
4707

    
4708
}
4709

    
4710
/*----------------------------------------------------------------------------
4711
| Returns 1 if the extended double-precision floating-point value `a' is less
4712
| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4713
| do not cause an exception.  Otherwise, the comparison is performed according
4714
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4715
*----------------------------------------------------------------------------*/
4716

    
4717
int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4718
{
4719
    flag aSign, bSign;
4720

    
4721
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4722
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4723
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4724
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4725
       ) {
4726
        if (    floatx80_is_signaling_nan( a )
4727
             || floatx80_is_signaling_nan( b ) ) {
4728
            float_raise( float_flag_invalid STATUS_VAR);
4729
        }
4730
        return 0;
4731
    }
4732
    aSign = extractFloatx80Sign( a );
4733
    bSign = extractFloatx80Sign( b );
4734
    if ( aSign != bSign ) {
4735
        return
4736
               aSign
4737
            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4738
                 == 0 );
4739
    }
4740
    return
4741
          aSign ? le128( b.high, b.low, a.high, a.low )
4742
        : le128( a.high, a.low, b.high, b.low );
4743

    
4744
}
4745

    
4746
/*----------------------------------------------------------------------------
4747
| Returns 1 if the extended double-precision floating-point value `a' is less
4748
| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4749
| an exception.  Otherwise, the comparison is performed according to the
4750
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4751
*----------------------------------------------------------------------------*/
4752

    
4753
int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4754
{
4755
    flag aSign, bSign;
4756

    
4757
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4758
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4759
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4760
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4761
       ) {
4762
        if (    floatx80_is_signaling_nan( a )
4763
             || floatx80_is_signaling_nan( b ) ) {
4764
            float_raise( float_flag_invalid STATUS_VAR);
4765
        }
4766
        return 0;
4767
    }
4768
    aSign = extractFloatx80Sign( a );
4769
    bSign = extractFloatx80Sign( b );
4770
    if ( aSign != bSign ) {
4771
        return
4772
               aSign
4773
            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4774
                 != 0 );
4775
    }
4776
    return
4777
          aSign ? lt128( b.high, b.low, a.high, a.low )
4778
        : lt128( a.high, a.low, b.high, b.low );
4779

    
4780
}
4781

    
4782
/*----------------------------------------------------------------------------
4783
| Returns 1 if the extended double-precision floating-point values `a' and `b'
4784
| cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
4785
| The comparison is performed according to the IEC/IEEE Standard for Binary
4786
| Floating-Point Arithmetic.
4787
*----------------------------------------------------------------------------*/
4788
int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4789
{
4790
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4791
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4792
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4793
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4794
       ) {
4795
        if (    floatx80_is_signaling_nan( a )
4796
             || floatx80_is_signaling_nan( b ) ) {
4797
            float_raise( float_flag_invalid STATUS_VAR);
4798
        }
4799
        return 1;
4800
    }
4801
    return 0;
4802
}
4803

    
4804
/*----------------------------------------------------------------------------
4805
| Returns the result of converting the quadruple-precision floating-point
4806
| value `a' to the 32-bit two's complement integer format.  The conversion
4807
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4808
| Arithmetic---which means in particular that the conversion is rounded
4809
| according to the current rounding mode.  If `a' is a NaN, the largest
4810
| positive integer is returned.  Otherwise, if the conversion overflows, the
4811
| largest integer with the same sign as `a' is returned.
4812
*----------------------------------------------------------------------------*/
4813

    
4814
int32 float128_to_int32( float128 a STATUS_PARAM )
4815
{
4816
    flag aSign;
4817
    int32 aExp, shiftCount;
4818
    uint64_t aSig0, aSig1;
4819

    
4820
    aSig1 = extractFloat128Frac1( a );
4821
    aSig0 = extractFloat128Frac0( a );
4822
    aExp = extractFloat128Exp( a );
4823
    aSign = extractFloat128Sign( a );
4824
    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4825
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4826
    aSig0 |= ( aSig1 != 0 );
4827
    shiftCount = 0x4028 - aExp;
4828
    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4829
    return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4830

    
4831
}
4832

    
4833
/*----------------------------------------------------------------------------
4834
| Returns the result of converting the quadruple-precision floating-point
4835
| value `a' to the 32-bit two's complement integer format.  The conversion
4836
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4837
| Arithmetic, except that the conversion is always rounded toward zero.  If
4838
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4839
| conversion overflows, the largest integer with the same sign as `a' is
4840
| returned.
4841
*----------------------------------------------------------------------------*/
4842

    
4843
int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4844
{
4845
    flag aSign;
4846
    int32 aExp, shiftCount;
4847
    uint64_t aSig0, aSig1, savedASig;
4848
    int32 z;
4849

    
4850
    aSig1 = extractFloat128Frac1( a );
4851
    aSig0 = extractFloat128Frac0( a );
4852
    aExp = extractFloat128Exp( a );
4853
    aSign = extractFloat128Sign( a );
4854
    aSig0 |= ( aSig1 != 0 );
4855
    if ( 0x401E < aExp ) {
4856
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4857
        goto invalid;
4858
    }
4859
    else if ( aExp < 0x3FFF ) {
4860
        if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4861
        return 0;
4862
    }
4863
    aSig0 |= LIT64( 0x0001000000000000 );
4864
    shiftCount = 0x402F - aExp;
4865
    savedASig = aSig0;
4866
    aSig0 >>= shiftCount;
4867
    z = aSig0;
4868
    if ( aSign ) z = - z;
4869
    if ( ( z < 0 ) ^ aSign ) {
4870
 invalid:
4871
        float_raise( float_flag_invalid STATUS_VAR);
4872
        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4873
    }
4874
    if ( ( aSig0<<shiftCount ) != savedASig ) {
4875
        STATUS(float_exception_flags) |= float_flag_inexact;
4876
    }
4877
    return z;
4878

    
4879
}
4880

    
4881
/*----------------------------------------------------------------------------
4882
| Returns the result of converting the quadruple-precision floating-point
4883
| value `a' to the 64-bit two's complement integer format.  The conversion
4884
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4885
| Arithmetic---which means in particular that the conversion is rounded
4886
| according to the current rounding mode.  If `a' is a NaN, the largest
4887
| positive integer is returned.  Otherwise, if the conversion overflows, the
4888
| largest integer with the same sign as `a' is returned.
4889
*----------------------------------------------------------------------------*/
4890

    
4891
int64 float128_to_int64( float128 a STATUS_PARAM )
4892
{
4893
    flag aSign;
4894
    int32 aExp, shiftCount;
4895
    uint64_t aSig0, aSig1;
4896

    
4897
    aSig1 = extractFloat128Frac1( a );
4898
    aSig0 = extractFloat128Frac0( a );
4899
    aExp = extractFloat128Exp( a );
4900
    aSign = extractFloat128Sign( a );
4901
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4902
    shiftCount = 0x402F - aExp;
4903
    if ( shiftCount <= 0 ) {
4904
        if ( 0x403E < aExp ) {
4905
            float_raise( float_flag_invalid STATUS_VAR);
4906
            if (    ! aSign
4907
                 || (    ( aExp == 0x7FFF )
4908
                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4909
                    )
4910
               ) {
4911
                return LIT64( 0x7FFFFFFFFFFFFFFF );
4912
            }
4913
            return (int64_t) LIT64( 0x8000000000000000 );
4914
        }
4915
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4916
    }
4917
    else {
4918
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4919
    }
4920
    return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4921

    
4922
}
4923

    
4924
/*----------------------------------------------------------------------------
4925
| Returns the result of converting the quadruple-precision floating-point
4926
| value `a' to the 64-bit two's complement integer format.  The conversion
4927
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4928
| Arithmetic, except that the conversion is always rounded toward zero.
4929
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4930
| the conversion overflows, the largest integer with the same sign as `a' is
4931
| returned.
4932
*----------------------------------------------------------------------------*/
4933

    
4934
int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4935
{
4936
    flag aSign;
4937
    int32 aExp, shiftCount;
4938
    uint64_t aSig0, aSig1;
4939
    int64 z;
4940

    
4941
    aSig1 = extractFloat128Frac1( a );
4942
    aSig0 = extractFloat128Frac0( a );
4943
    aExp = extractFloat128Exp( a );
4944
    aSign = extractFloat128Sign( a );
4945
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4946
    shiftCount = aExp - 0x402F;
4947
    if ( 0 < shiftCount ) {
4948
        if ( 0x403E <= aExp ) {
4949
            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4950
            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4951
                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4952
                if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4953
            }
4954
            else {
4955
                float_raise( float_flag_invalid STATUS_VAR);
4956
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4957
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
4958
                }
4959
            }
4960
            return (int64_t) LIT64( 0x8000000000000000 );
4961
        }
4962
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4963
        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
4964
            STATUS(float_exception_flags) |= float_flag_inexact;
4965
        }
4966
    }
4967
    else {
4968
        if ( aExp < 0x3FFF ) {
4969
            if ( aExp | aSig0 | aSig1 ) {
4970
                STATUS(float_exception_flags) |= float_flag_inexact;
4971
            }
4972
            return 0;
4973
        }
4974
        z = aSig0>>( - shiftCount );
4975
        if (    aSig1
4976
             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4977
            STATUS(float_exception_flags) |= float_flag_inexact;
4978
        }
4979
    }
4980
    if ( aSign ) z = - z;
4981
    return z;
4982

    
4983
}
4984

    
4985
/*----------------------------------------------------------------------------
4986
| Returns the result of converting the quadruple-precision floating-point
4987
| value `a' to the single-precision floating-point format.  The conversion
4988
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4989
| Arithmetic.
4990
*----------------------------------------------------------------------------*/
4991

    
4992
float32 float128_to_float32( float128 a STATUS_PARAM )
4993
{
4994
    flag aSign;
4995
    int32 aExp;
4996
    uint64_t aSig0, aSig1;
4997
    uint32_t zSig;
4998

    
4999
    aSig1 = extractFloat128Frac1( a );
5000
    aSig0 = extractFloat128Frac0( a );
5001
    aExp = extractFloat128Exp( a );
5002
    aSign = extractFloat128Sign( a );
5003
    if ( aExp == 0x7FFF ) {
5004
        if ( aSig0 | aSig1 ) {
5005
            return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5006
        }
5007
        return packFloat32( aSign, 0xFF, 0 );
5008
    }
5009
    aSig0 |= ( aSig1 != 0 );
5010
    shift64RightJamming( aSig0, 18, &aSig0 );
5011
    zSig = aSig0;
5012
    if ( aExp || zSig ) {
5013
        zSig |= 0x40000000;
5014
        aExp -= 0x3F81;
5015
    }
5016
    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5017

    
5018
}
5019

    
5020
/*----------------------------------------------------------------------------
5021
| Returns the result of converting the quadruple-precision floating-point
5022
| value `a' to the double-precision floating-point format.  The conversion
5023
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5024
| Arithmetic.
5025
*----------------------------------------------------------------------------*/
5026

    
5027
float64 float128_to_float64( float128 a STATUS_PARAM )
5028
{
5029
    flag aSign;
5030
    int32 aExp;
5031
    uint64_t aSig0, aSig1;
5032

    
5033
    aSig1 = extractFloat128Frac1( a );
5034
    aSig0 = extractFloat128Frac0( a );
5035
    aExp = extractFloat128Exp( a );
5036
    aSign = extractFloat128Sign( a );
5037
    if ( aExp == 0x7FFF ) {
5038
        if ( aSig0 | aSig1 ) {
5039
            return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5040
        }
5041
        return packFloat64( aSign, 0x7FF, 0 );
5042
    }
5043
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5044
    aSig0 |= ( aSig1 != 0 );
5045
    if ( aExp || aSig0 ) {
5046
        aSig0 |= LIT64( 0x4000000000000000 );
5047
        aExp -= 0x3C01;
5048
    }
5049
    return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5050

    
5051
}
5052

    
5053
/*----------------------------------------------------------------------------
5054
| Returns the result of converting the quadruple-precision floating-point
5055
| value `a' to the extended double-precision floating-point format.  The
5056
| conversion is performed according to the IEC/IEEE Standard for Binary
5057
| Floating-Point Arithmetic.
5058
*----------------------------------------------------------------------------*/
5059

    
5060
floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5061
{
5062
    flag aSign;
5063
    int32 aExp;
5064
    uint64_t aSig0, aSig1;
5065

    
5066
    aSig1 = extractFloat128Frac1( a );
5067
    aSig0 = extractFloat128Frac0( a );
5068
    aExp = extractFloat128Exp( a );
5069
    aSign = extractFloat128Sign( a );
5070
    if ( aExp == 0x7FFF ) {
5071
        if ( aSig0 | aSig1 ) {
5072
            return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5073
        }
5074
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5075
    }
5076
    if ( aExp == 0 ) {
5077
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5078
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5079
    }
5080
    else {
5081
        aSig0 |= LIT64( 0x0001000000000000 );
5082
    }
5083
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5084
    return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5085

    
5086
}
5087

    
5088
/*----------------------------------------------------------------------------
5089
| Rounds the quadruple-precision floating-point value `a' to an integer, and
5090
| returns the result as a quadruple-precision floating-point value.  The
5091
| operation is performed according to the IEC/IEEE Standard for Binary
5092
| Floating-Point Arithmetic.
5093
*----------------------------------------------------------------------------*/
5094

    
5095
float128 float128_round_to_int( float128 a STATUS_PARAM )
5096
{
5097
    flag aSign;
5098
    int32 aExp;
5099
    uint64_t lastBitMask, roundBitsMask;
5100
    int8 roundingMode;
5101
    float128 z;
5102

    
5103
    aExp = extractFloat128Exp( a );
5104
    if ( 0x402F <= aExp ) {
5105
        if ( 0x406F <= aExp ) {
5106
            if (    ( aExp == 0x7FFF )
5107
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5108
               ) {
5109
                return propagateFloat128NaN( a, a STATUS_VAR );
5110
            }
5111
            return a;
5112
        }
5113
        lastBitMask = 1;
5114
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5115
        roundBitsMask = lastBitMask - 1;
5116
        z = a;
5117
        roundingMode = STATUS(float_rounding_mode);
5118
        if ( roundingMode == float_round_nearest_even ) {
5119
            if ( lastBitMask ) {
5120
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5121
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5122
            }
5123
            else {
5124
                if ( (int64_t) z.low < 0 ) {
5125
                    ++z.high;
5126
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5127
                }
5128
            }
5129
        }
5130
        else if ( roundingMode != float_round_to_zero ) {
5131
            if (   extractFloat128Sign( z )
5132
                 ^ ( roundingMode == float_round_up ) ) {
5133
                add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5134
            }
5135
        }
5136
        z.low &= ~ roundBitsMask;
5137
    }
5138
    else {
5139
        if ( aExp < 0x3FFF ) {
5140
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5141
            STATUS(float_exception_flags) |= float_flag_inexact;
5142
            aSign = extractFloat128Sign( a );
5143
            switch ( STATUS(float_rounding_mode) ) {
5144
             case float_round_nearest_even:
5145
                if (    ( aExp == 0x3FFE )
5146
                     && (   extractFloat128Frac0( a )
5147
                          | extractFloat128Frac1( a ) )
5148
                   ) {
5149
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
5150
                }
5151
                break;
5152
             case float_round_down:
5153
                return
5154
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5155
                    : packFloat128( 0, 0, 0, 0 );
5156
             case float_round_up:
5157
                return
5158
                      aSign ? packFloat128( 1, 0, 0, 0 )
5159
                    : packFloat128( 0, 0x3FFF, 0, 0 );
5160
            }
5161
            return packFloat128( aSign, 0, 0, 0 );
5162
        }
5163
        lastBitMask = 1;
5164
        lastBitMask <<= 0x402F - aExp;
5165
        roundBitsMask = lastBitMask - 1;
5166
        z.low = 0;
5167
        z.high = a.high;
5168
        roundingMode = STATUS(float_rounding_mode);
5169
        if ( roundingMode == float_round_nearest_even ) {
5170
            z.high += lastBitMask>>1;
5171
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5172
                z.high &= ~ lastBitMask;
5173
            }
5174
        }
5175
        else if ( roundingMode != float_round_to_zero ) {
5176
            if (   extractFloat128Sign( z )
5177
                 ^ ( roundingMode == float_round_up ) ) {
5178
                z.high |= ( a.low != 0 );
5179
                z.high += roundBitsMask;
5180
            }
5181
        }
5182
        z.high &= ~ roundBitsMask;
5183
    }
5184
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5185
        STATUS(float_exception_flags) |= float_flag_inexact;
5186
    }
5187
    return z;
5188

    
5189
}
5190

    
5191
/*----------------------------------------------------------------------------
5192
| Returns the result of adding the absolute values of the quadruple-precision
5193
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
5194
| before being returned.  `zSign' is ignored if the result is a NaN.
5195
| The addition is performed according to the IEC/IEEE Standard for Binary
5196
| Floating-Point Arithmetic.
5197
*----------------------------------------------------------------------------*/
5198

    
5199
static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5200
{
5201
    int32 aExp, bExp, zExp;
5202
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5203
    int32 expDiff;
5204

    
5205
    aSig1 = extractFloat128Frac1( a );
5206
    aSig0 = extractFloat128Frac0( a );
5207
    aExp = extractFloat128Exp( a );
5208
    bSig1 = extractFloat128Frac1( b );
5209
    bSig0 = extractFloat128Frac0( b );
5210
    bExp = extractFloat128Exp( b );
5211
    expDiff = aExp - bExp;
5212
    if ( 0 < expDiff ) {
5213
        if ( aExp == 0x7FFF ) {
5214
            if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5215
            return a;
5216
        }
5217
        if ( bExp == 0 ) {
5218
            --expDiff;
5219
        }
5220
        else {
5221
            bSig0 |= LIT64( 0x0001000000000000 );
5222
        }
5223
        shift128ExtraRightJamming(
5224
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5225
        zExp = aExp;
5226
    }
5227
    else if ( expDiff < 0 ) {
5228
        if ( bExp == 0x7FFF ) {
5229
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5230
            return packFloat128( zSign, 0x7FFF, 0, 0 );
5231
        }
5232
        if ( aExp == 0 ) {
5233
            ++expDiff;
5234
        }
5235
        else {
5236
            aSig0 |= LIT64( 0x0001000000000000 );
5237
        }
5238
        shift128ExtraRightJamming(
5239
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5240
        zExp = bExp;
5241
    }
5242
    else {
5243
        if ( aExp == 0x7FFF ) {
5244
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5245
                return propagateFloat128NaN( a, b STATUS_VAR );
5246
            }
5247
            return a;
5248
        }
5249
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5250
        if ( aExp == 0 ) {
5251
            if (STATUS(flush_to_zero)) {
5252
                if (zSig0 | zSig1) {
5253
                    float_raise(float_flag_output_denormal STATUS_VAR);
5254
                }
5255
                return packFloat128(zSign, 0, 0, 0);
5256
            }
5257
            return packFloat128( zSign, 0, zSig0, zSig1 );
5258
        }
5259
        zSig2 = 0;
5260
        zSig0 |= LIT64( 0x0002000000000000 );
5261
        zExp = aExp;
5262
        goto shiftRight1;
5263
    }
5264
    aSig0 |= LIT64( 0x0001000000000000 );
5265
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5266
    --zExp;
5267
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5268
    ++zExp;
5269
 shiftRight1:
5270
    shift128ExtraRightJamming(
5271
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5272
 roundAndPack:
5273
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5274

    
5275
}
5276

    
5277
/*----------------------------------------------------------------------------
5278
| Returns the result of subtracting the absolute values of the quadruple-
5279
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
5280
| difference is negated before being returned.  `zSign' is ignored if the
5281
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
5282
| Standard for Binary Floating-Point Arithmetic.
5283
*----------------------------------------------------------------------------*/
5284

    
5285
static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5286
{
5287
    int32 aExp, bExp, zExp;
5288
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5289
    int32 expDiff;
5290
    float128 z;
5291

    
5292
    aSig1 = extractFloat128Frac1( a );
5293
    aSig0 = extractFloat128Frac0( a );
5294
    aExp = extractFloat128Exp( a );
5295
    bSig1 = extractFloat128Frac1( b );
5296
    bSig0 = extractFloat128Frac0( b );
5297
    bExp = extractFloat128Exp( b );
5298
    expDiff = aExp - bExp;
5299
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5300
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5301
    if ( 0 < expDiff ) goto aExpBigger;
5302
    if ( expDiff < 0 ) goto bExpBigger;
5303
    if ( aExp == 0x7FFF ) {
5304
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5305
            return propagateFloat128NaN( a, b STATUS_VAR );
5306
        }
5307
        float_raise( float_flag_invalid STATUS_VAR);
5308
        z.low = float128_default_nan_low;
5309
        z.high = float128_default_nan_high;
5310
        return z;
5311
    }
5312
    if ( aExp == 0 ) {
5313
        aExp = 1;
5314
        bExp = 1;
5315
    }
5316
    if ( bSig0 < aSig0 ) goto aBigger;
5317
    if ( aSig0 < bSig0 ) goto bBigger;
5318
    if ( bSig1 < aSig1 ) goto aBigger;
5319
    if ( aSig1 < bSig1 ) goto bBigger;
5320
    return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5321
 bExpBigger:
5322
    if ( bExp == 0x7FFF ) {
5323
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5324
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5325
    }
5326
    if ( aExp == 0 ) {
5327
        ++expDiff;
5328
    }
5329
    else {
5330
        aSig0 |= LIT64( 0x4000000000000000 );
5331
    }
5332
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5333
    bSig0 |= LIT64( 0x4000000000000000 );
5334
 bBigger:
5335
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5336
    zExp = bExp;
5337
    zSign ^= 1;
5338
    goto normalizeRoundAndPack;
5339
 aExpBigger:
5340
    if ( aExp == 0x7FFF ) {
5341
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5342
        return a;
5343
    }
5344
    if ( bExp == 0 ) {
5345
        --expDiff;
5346
    }
5347
    else {
5348
        bSig0 |= LIT64( 0x4000000000000000 );
5349
    }
5350
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5351
    aSig0 |= LIT64( 0x4000000000000000 );
5352
 aBigger:
5353
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5354
    zExp = aExp;
5355
 normalizeRoundAndPack:
5356
    --zExp;
5357
    return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5358

    
5359
}
5360

    
5361
/*----------------------------------------------------------------------------
5362
| Returns the result of adding the quadruple-precision floating-point values
5363
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
5364
| for Binary Floating-Point Arithmetic.
5365
*----------------------------------------------------------------------------*/
5366

    
5367
float128 float128_add( float128 a, float128 b STATUS_PARAM )
5368
{
5369
    flag aSign, bSign;
5370

    
5371
    aSign = extractFloat128Sign( a );
5372
    bSign = extractFloat128Sign( b );
5373
    if ( aSign == bSign ) {
5374
        return addFloat128Sigs( a, b, aSign STATUS_VAR );
5375
    }
5376
    else {
5377
        return subFloat128Sigs( a, b, aSign STATUS_VAR );
5378
    }
5379

    
5380
}
5381

    
5382
/*----------------------------------------------------------------------------
5383
| Returns the result of subtracting the quadruple-precision floating-point
5384
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
5385
| Standard for Binary Floating-Point Arithmetic.
5386
*----------------------------------------------------------------------------*/
5387

    
5388
float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5389
{
5390
    flag aSign, bSign;
5391

    
5392
    aSign = extractFloat128Sign( a );
5393
    bSign = extractFloat128Sign( b );
5394
    if ( aSign == bSign ) {
5395
        return subFloat128Sigs( a, b, aSign STATUS_VAR );
5396
    }
5397
    else {
5398
        return addFloat128Sigs( a, b, aSign STATUS_VAR );
5399
    }
5400

    
5401
}
5402

    
5403
/*----------------------------------------------------------------------------
5404
| Returns the result of multiplying the quadruple-precision floating-point
5405
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
5406
| Standard for Binary Floating-Point Arithmetic.
5407
*----------------------------------------------------------------------------*/
5408

    
5409
float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5410
{
5411
    flag aSign, bSign, zSign;
5412
    int32 aExp, bExp, zExp;
5413
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5414
    float128 z;
5415

    
5416
    aSig1 = extractFloat128Frac1( a );
5417
    aSig0 = extractFloat128Frac0( a );
5418
    aExp = extractFloat128Exp( a );
5419
    aSign = extractFloat128Sign( a );
5420
    bSig1 = extractFloat128Frac1( b );
5421
    bSig0 = extractFloat128Frac0( b );
5422
    bExp = extractFloat128Exp( b );
5423
    bSign = extractFloat128Sign( b );
5424
    zSign = aSign ^ bSign;
5425
    if ( aExp == 0x7FFF ) {
5426
        if (    ( aSig0 | aSig1 )
5427
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5428
            return propagateFloat128NaN( a, b STATUS_VAR );
5429
        }
5430
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5431
        return packFloat128( zSign, 0x7FFF, 0, 0 );
5432
    }
5433
    if ( bExp == 0x7FFF ) {
5434
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5435
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5436
 invalid:
5437
            float_raise( float_flag_invalid STATUS_VAR);
5438
            z.low = float128_default_nan_low;
5439
            z.high = float128_default_nan_high;
5440
            return z;
5441
        }
5442
        return packFloat128( zSign, 0x7FFF, 0, 0 );
5443
    }
5444
    if ( aExp == 0 ) {
5445
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5446
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5447
    }
5448
    if ( bExp == 0 ) {
5449
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5450
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5451
    }
5452
    zExp = aExp + bExp - 0x4000;
5453
    aSig0 |= LIT64( 0x0001000000000000 );
5454
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5455
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5456
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5457
    zSig2 |= ( zSig3 != 0 );
5458
    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5459
        shift128ExtraRightJamming(
5460
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5461
        ++zExp;
5462
    }
5463
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5464

    
5465
}
5466

    
5467
/*----------------------------------------------------------------------------
5468
| Returns the result of dividing the quadruple-precision floating-point value
5469
| `a' by the corresponding value `b'.  The operation is performed according to
5470
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5471
*----------------------------------------------------------------------------*/
5472

    
5473
float128 float128_div( float128 a, float128 b STATUS_PARAM )
5474
{
5475
    flag aSign, bSign, zSign;
5476
    int32 aExp, bExp, zExp;
5477
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5478
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5479
    float128 z;
5480

    
5481
    aSig1 = extractFloat128Frac1( a );
5482
    aSig0 = extractFloat128Frac0( a );
5483
    aExp = extractFloat128Exp( a );
5484
    aSign = extractFloat128Sign( a );
5485
    bSig1 = extractFloat128Frac1( b );
5486
    bSig0 = extractFloat128Frac0( b );
5487
    bExp = extractFloat128Exp( b );
5488
    bSign = extractFloat128Sign( b );
5489
    zSign = aSign ^ bSign;
5490
    if ( aExp == 0x7FFF ) {
5491
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5492
        if ( bExp == 0x7FFF ) {
5493
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5494
            goto invalid;
5495
        }
5496
        return packFloat128( zSign, 0x7FFF, 0, 0 );
5497
    }
5498
    if ( bExp == 0x7FFF ) {
5499
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5500
        return packFloat128( zSign, 0, 0, 0 );
5501
    }
5502
    if ( bExp == 0 ) {
5503
        if ( ( bSig0 | bSig1 ) == 0 ) {
5504
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5505
 invalid:
5506
                float_raise( float_flag_invalid STATUS_VAR);
5507
                z.low = float128_default_nan_low;
5508
                z.high = float128_default_nan_high;
5509
                return z;
5510
            }
5511
            float_raise( float_flag_divbyzero STATUS_VAR);
5512
            return packFloat128( zSign, 0x7FFF, 0, 0 );
5513
        }
5514
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5515
    }
5516
    if ( aExp == 0 ) {
5517
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5518
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5519
    }
5520
    zExp = aExp - bExp + 0x3FFD;
5521
    shortShift128Left(
5522
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5523
    shortShift128Left(
5524
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5525
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5526
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5527
        ++zExp;
5528
    }
5529
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5530
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5531
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5532
    while ( (int64_t) rem0 < 0 ) {
5533
        --zSig0;
5534
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5535
    }
5536
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5537
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5538
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5539
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5540
        while ( (int64_t) rem1 < 0 ) {
5541
            --zSig1;
5542
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5543
        }
5544
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5545
    }
5546
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5547
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5548

    
5549
}
5550

    
5551
/*----------------------------------------------------------------------------
5552
| Returns the remainder of the quadruple-precision floating-point value `a'
5553
| with respect to the corresponding value `b'.  The operation is performed
5554
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5555
*----------------------------------------------------------------------------*/
5556

    
5557
float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5558
{
5559
    flag aSign, zSign;
5560
    int32 aExp, bExp, expDiff;
5561
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5562
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
5563
    int64_t sigMean0;
5564
    float128 z;
5565

    
5566
    aSig1 = extractFloat128Frac1( a );
5567
    aSig0 = extractFloat128Frac0( a );
5568
    aExp = extractFloat128Exp( a );
5569
    aSign = extractFloat128Sign( a );
5570
    bSig1 = extractFloat128Frac1( b );
5571
    bSig0 = extractFloat128Frac0( b );
5572
    bExp = extractFloat128Exp( b );
5573
    if ( aExp == 0x7FFF ) {
5574
        if (    ( aSig0 | aSig1 )
5575
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5576
            return propagateFloat128NaN( a, b STATUS_VAR );
5577
        }
5578
        goto invalid;
5579
    }
5580
    if ( bExp == 0x7FFF ) {
5581
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5582
        return a;
5583
    }
5584
    if ( bExp == 0 ) {
5585
        if ( ( bSig0 | bSig1 ) == 0 ) {
5586
 invalid:
5587
            float_raise( float_flag_invalid STATUS_VAR);
5588
            z.low = float128_default_nan_low;
5589
            z.high = float128_default_nan_high;
5590
            return z;
5591
        }
5592
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5593
    }
5594
    if ( aExp == 0 ) {
5595
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
5596
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5597
    }
5598
    expDiff = aExp - bExp;
5599
    if ( expDiff < -1 ) return a;
5600
    shortShift128Left(
5601
        aSig0 | LIT64( 0x0001000000000000 ),
5602
        aSig1,
5603
        15 - ( expDiff < 0 ),
5604
        &aSig0,
5605
        &aSig1
5606
    );
5607
    shortShift128Left(
5608
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5609
    q = le128( bSig0, bSig1, aSig0, aSig1 );
5610
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5611
    expDiff -= 64;
5612
    while ( 0 < expDiff ) {
5613
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5614
        q = ( 4 < q ) ? q - 4 : 0;
5615
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5616
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5617
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5618
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5619
        expDiff -= 61;
5620
    }
5621
    if ( -64 < expDiff ) {
5622
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5623
        q = ( 4 < q ) ? q - 4 : 0;
5624
        q >>= - expDiff;
5625
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5626
        expDiff += 52;
5627
        if ( expDiff < 0 ) {
5628
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5629
        }
5630
        else {
5631
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5632
        }
5633
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5634
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5635
    }
5636
    else {
5637
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5638
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5639
    }
5640
    do {
5641
        alternateASig0 = aSig0;
5642
        alternateASig1 = aSig1;
5643
        ++q;
5644
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5645
    } while ( 0 <= (int64_t) aSig0 );
5646
    add128(
5647
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
5648
    if (    ( sigMean0 < 0 )
5649
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5650
        aSig0 = alternateASig0;
5651
        aSig1 = alternateASig1;
5652
    }
5653
    zSign = ( (int64_t) aSig0 < 0 );
5654
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5655
    return
5656
        normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5657

    
5658
}
5659

    
5660
/*----------------------------------------------------------------------------
5661
| Returns the square root of the quadruple-precision floating-point value `a'.
5662
| The operation is performed according to the IEC/IEEE Standard for Binary
5663
| Floating-Point Arithmetic.
5664
*----------------------------------------------------------------------------*/
5665

    
5666
float128 float128_sqrt( float128 a STATUS_PARAM )
5667
{
5668
    flag aSign;
5669
    int32 aExp, zExp;
5670
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5671
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5672
    float128 z;
5673

    
5674
    aSig1 = extractFloat128Frac1( a );
5675
    aSig0 = extractFloat128Frac0( a );
5676
    aExp = extractFloat128Exp( a );
5677
    aSign = extractFloat128Sign( a );
5678
    if ( aExp == 0x7FFF ) {
5679
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5680
        if ( ! aSign ) return a;
5681
        goto invalid;
5682
    }
5683
    if ( aSign ) {
5684
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5685
 invalid:
5686
        float_raise( float_flag_invalid STATUS_VAR);
5687
        z.low = float128_default_nan_low;
5688
        z.high = float128_default_nan_high;
5689
        return z;
5690
    }
5691
    if ( aExp == 0 ) {
5692
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5693
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5694
    }
5695
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5696
    aSig0 |= LIT64( 0x0001000000000000 );
5697
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5698
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5699
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5700
    doubleZSig0 = zSig0<<1;
5701
    mul64To128( zSig0, zSig0, &term0, &term1 );
5702
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5703
    while ( (int64_t) rem0 < 0 ) {
5704
        --zSig0;
5705
        doubleZSig0 -= 2;
5706
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5707
    }
5708
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5709
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5710
        if ( zSig1 == 0 ) zSig1 = 1;
5711
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5712
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5713
        mul64To128( zSig1, zSig1, &term2, &term3 );
5714
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5715
        while ( (int64_t) rem1 < 0 ) {
5716
            --zSig1;
5717
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5718
            term3 |= 1;
5719
            term2 |= doubleZSig0;
5720
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5721
        }
5722
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5723
    }
5724
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5725
    return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5726

    
5727
}
5728

    
5729
/*----------------------------------------------------------------------------
5730
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5731
| the corresponding value `b', and 0 otherwise.  The invalid exception is
5732
| raised if either operand is a NaN.  Otherwise, the comparison is performed
5733
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5734
*----------------------------------------------------------------------------*/
5735

    
5736
int float128_eq( float128 a, float128 b STATUS_PARAM )
5737
{
5738

    
5739
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5740
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5741
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5742
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5743
       ) {
5744
        float_raise( float_flag_invalid STATUS_VAR);
5745
        return 0;
5746
    }
5747
    return
5748
           ( a.low == b.low )
5749
        && (    ( a.high == b.high )
5750
             || (    ( a.low == 0 )
5751
                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5752
           );
5753

    
5754
}
5755

    
5756
/*----------------------------------------------------------------------------
5757
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5758
| or equal to the corresponding value `b', and 0 otherwise.  The invalid
5759
| exception is raised if either operand is a NaN.  The comparison is performed
5760
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5761
*----------------------------------------------------------------------------*/
5762

    
5763
int float128_le( float128 a, float128 b STATUS_PARAM )
5764
{
5765
    flag aSign, bSign;
5766

    
5767
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5768
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5769
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5770
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5771
       ) {
5772
        float_raise( float_flag_invalid STATUS_VAR);
5773
        return 0;
5774
    }
5775
    aSign = extractFloat128Sign( a );
5776
    bSign = extractFloat128Sign( b );
5777
    if ( aSign != bSign ) {
5778
        return
5779
               aSign
5780
            || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5781
                 == 0 );
5782
    }
5783
    return
5784
          aSign ? le128( b.high, b.low, a.high, a.low )
5785
        : le128( a.high, a.low, b.high, b.low );
5786

    
5787
}
5788

    
5789
/*----------------------------------------------------------------------------
5790
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5791
| the corresponding value `b', and 0 otherwise.  The invalid exception is
5792
| raised if either operand is a NaN.  The comparison is performed according
5793
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5794
*----------------------------------------------------------------------------*/
5795

    
5796
int float128_lt( float128 a, float128 b STATUS_PARAM )
5797
{
5798
    flag aSign, bSign;
5799

    
5800
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5801
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5802
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5803
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5804
       ) {
5805
        float_raise( float_flag_invalid STATUS_VAR);
5806
        return 0;
5807
    }
5808
    aSign = extractFloat128Sign( a );
5809
    bSign = extractFloat128Sign( b );
5810
    if ( aSign != bSign ) {
5811
        return
5812
               aSign
5813
            && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5814
                 != 0 );
5815
    }
5816
    return
5817
          aSign ? lt128( b.high, b.low, a.high, a.low )
5818
        : lt128( a.high, a.low, b.high, b.low );
5819

    
5820
}
5821

    
5822
/*----------------------------------------------------------------------------
5823
| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
5824
| be compared, and 0 otherwise.  The invalid exception is raised if either
5825
| operand is a NaN. The comparison is performed according to the IEC/IEEE
5826
| Standard for Binary Floating-Point Arithmetic.
5827
*----------------------------------------------------------------------------*/
5828

    
5829
int float128_unordered( float128 a, float128 b STATUS_PARAM )
5830
{
5831
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5832
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5833
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5834
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5835
       ) {
5836
        float_raise( float_flag_invalid STATUS_VAR);
5837
        return 1;
5838
    }
5839
    return 0;
5840
}
5841

    
5842
/*----------------------------------------------------------------------------
5843
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5844
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5845
| exception.  The comparison is performed according to the IEC/IEEE Standard
5846
| for Binary Floating-Point Arithmetic.
5847
*----------------------------------------------------------------------------*/
5848

    
5849
int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
5850
{
5851

    
5852
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5853
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5854
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5855
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5856
       ) {
5857
        if (    float128_is_signaling_nan( a )
5858
             || float128_is_signaling_nan( b ) ) {
5859
            float_raise( float_flag_invalid STATUS_VAR);
5860
        }
5861
        return 0;
5862
    }
5863
    return
5864
           ( a.low == b.low )
5865
        && (    ( a.high == b.high )
5866
             || (    ( a.low == 0 )
5867
                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5868
           );
5869

    
5870
}
5871

    
5872
/*----------------------------------------------------------------------------
5873
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5874
| or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5875
| cause an exception.  Otherwise, the comparison is performed according to the
5876
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5877
*----------------------------------------------------------------------------*/
5878

    
5879
int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5880
{
5881
    flag aSign, bSign;
5882

    
5883
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5884
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5885
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5886
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5887
       ) {
5888
        if (    float128_is_signaling_nan( a )
5889
             || float128_is_signaling_nan( b ) ) {
5890
            float_raise( float_flag_invalid STATUS_VAR);
5891
        }
5892
        return 0;
5893
    }
5894
    aSign = extractFloat128Sign( a );
5895
    bSign = extractFloat128Sign( b );
5896
    if ( aSign != bSign ) {
5897
        return
5898
               aSign
5899
            || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5900
                 == 0 );
5901
    }
5902
    return
5903
          aSign ? le128( b.high, b.low, a.high, a.low )
5904
        : le128( a.high, a.low, b.high, b.low );
5905

    
5906
}
5907

    
5908
/*----------------------------------------------------------------------------
5909
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5910
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5911
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5912
| Standard for Binary Floating-Point Arithmetic.
5913
*----------------------------------------------------------------------------*/
5914

    
5915
int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5916
{
5917
    flag aSign, bSign;
5918

    
5919
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5920
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5921
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5922
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5923
       ) {
5924
        if (    float128_is_signaling_nan( a )
5925
             || float128_is_signaling_nan( b ) ) {
5926
            float_raise( float_flag_invalid STATUS_VAR);
5927
        }
5928
        return 0;
5929
    }
5930
    aSign = extractFloat128Sign( a );
5931
    bSign = extractFloat128Sign( b );
5932
    if ( aSign != bSign ) {
5933
        return
5934
               aSign
5935
            && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5936
                 != 0 );
5937
    }
5938
    return
5939
          aSign ? lt128( b.high, b.low, a.high, a.low )
5940
        : lt128( a.high, a.low, b.high, b.low );
5941

    
5942
}
5943

    
5944
/*----------------------------------------------------------------------------
5945
| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
5946
| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
5947
| comparison is performed according to the IEC/IEEE Standard for Binary
5948
| Floating-Point Arithmetic.
5949
*----------------------------------------------------------------------------*/
5950

    
5951
int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
5952
{
5953
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5954
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5955
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5956
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5957
       ) {
5958
        if (    float128_is_signaling_nan( a )
5959
             || float128_is_signaling_nan( b ) ) {
5960
            float_raise( float_flag_invalid STATUS_VAR);
5961
        }
5962
        return 1;
5963
    }
5964
    return 0;
5965
}
5966

    
5967
/* misc functions */
5968
float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5969
{
5970
    return int64_to_float32(a STATUS_VAR);
5971
}
5972

    
5973
float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5974
{
5975
    return int64_to_float64(a STATUS_VAR);
5976
}
5977

    
5978
unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5979
{
5980
    int64_t v;
5981
    unsigned int res;
5982

    
5983
    v = float32_to_int64(a STATUS_VAR);
5984
    if (v < 0) {
5985
        res = 0;
5986
        float_raise( float_flag_invalid STATUS_VAR);
5987
    } else if (v > 0xffffffff) {
5988
        res = 0xffffffff;
5989
        float_raise( float_flag_invalid STATUS_VAR);
5990
    } else {
5991
        res = v;
5992
    }
5993
    return res;
5994
}
5995

    
5996
unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5997
{
5998
    int64_t v;
5999
    unsigned int res;
6000

    
6001
    v = float32_to_int64_round_to_zero(a STATUS_VAR);
6002
    if (v < 0) {
6003
        res = 0;
6004
        float_raise( float_flag_invalid STATUS_VAR);
6005
    } else if (v > 0xffffffff) {
6006
        res = 0xffffffff;
6007
        float_raise( float_flag_invalid STATUS_VAR);
6008
    } else {
6009
        res = v;
6010
    }
6011
    return res;
6012
}
6013

    
6014
unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
6015
{
6016
    int64_t v;
6017
    unsigned int res;
6018

    
6019
    v = float32_to_int64_round_to_zero(a STATUS_VAR);
6020
    if (v < 0) {
6021
        res = 0;
6022
        float_raise( float_flag_invalid STATUS_VAR);
6023
    } else if (v > 0xffff) {
6024
        res = 0xffff;
6025
        float_raise( float_flag_invalid STATUS_VAR);
6026
    } else {
6027
        res = v;
6028
    }
6029
    return res;
6030
}
6031

    
6032
unsigned int float64_to_uint32( float64 a STATUS_PARAM )
6033
{
6034
    int64_t v;
6035
    unsigned int res;
6036

    
6037
    v = float64_to_int64(a STATUS_VAR);
6038
    if (v < 0) {
6039
        res = 0;
6040
        float_raise( float_flag_invalid STATUS_VAR);
6041
    } else if (v > 0xffffffff) {
6042
        res = 0xffffffff;
6043
        float_raise( float_flag_invalid STATUS_VAR);
6044
    } else {
6045
        res = v;
6046
    }
6047
    return res;
6048
}
6049

    
6050
unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
6051
{
6052
    int64_t v;
6053
    unsigned int res;
6054

    
6055
    v = float64_to_int64_round_to_zero(a STATUS_VAR);
6056
    if (v < 0) {
6057
        res = 0;
6058
        float_raise( float_flag_invalid STATUS_VAR);
6059
    } else if (v > 0xffffffff) {
6060
        res = 0xffffffff;
6061
        float_raise( float_flag_invalid STATUS_VAR);
6062
    } else {
6063
        res = v;
6064
    }
6065
    return res;
6066
}
6067

    
6068
unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
6069
{
6070
    int64_t v;
6071
    unsigned int res;
6072

    
6073
    v = float64_to_int64_round_to_zero(a STATUS_VAR);
6074
    if (v < 0) {
6075
        res = 0;
6076
        float_raise( float_flag_invalid STATUS_VAR);
6077
    } else if (v > 0xffff) {
6078
        res = 0xffff;
6079
        float_raise( float_flag_invalid STATUS_VAR);
6080
    } else {
6081
        res = v;
6082
    }
6083
    return res;
6084
}
6085

    
6086
/* FIXME: This looks broken.  */
6087
uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
6088
{
6089
    int64_t v;
6090

    
6091
    v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6092
    v += float64_val(a);
6093
    v = float64_to_int64(make_float64(v) STATUS_VAR);
6094

    
6095
    return v - INT64_MIN;
6096
}
6097

    
6098
uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6099
{
6100
    int64_t v;
6101

    
6102
    v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6103
    v += float64_val(a);
6104
    v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
6105

    
6106
    return v - INT64_MIN;
6107
}
6108

    
6109
#define COMPARE(s, nan_exp)                                                  \
6110
INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
6111
                                      int is_quiet STATUS_PARAM )            \
6112
{                                                                            \
6113
    flag aSign, bSign;                                                       \
6114
    uint ## s ## _t av, bv;                                                  \
6115
    a = float ## s ## _squash_input_denormal(a STATUS_VAR);                  \
6116
    b = float ## s ## _squash_input_denormal(b STATUS_VAR);                  \
6117
                                                                             \
6118
    if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
6119
         extractFloat ## s ## Frac( a ) ) ||                                 \
6120
        ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
6121
          extractFloat ## s ## Frac( b ) )) {                                \
6122
        if (!is_quiet ||                                                     \
6123
            float ## s ## _is_signaling_nan( a ) ||                          \
6124
            float ## s ## _is_signaling_nan( b ) ) {                         \
6125
            float_raise( float_flag_invalid STATUS_VAR);                     \
6126
        }                                                                    \
6127
        return float_relation_unordered;                                     \
6128
    }                                                                        \
6129
    aSign = extractFloat ## s ## Sign( a );                                  \
6130
    bSign = extractFloat ## s ## Sign( b );                                  \
6131
    av = float ## s ## _val(a);                                              \
6132
    bv = float ## s ## _val(b);                                              \
6133
    if ( aSign != bSign ) {                                                  \
6134
        if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
6135
            /* zero case */                                                  \
6136
            return float_relation_equal;                                     \
6137
        } else {                                                             \
6138
            return 1 - (2 * aSign);                                          \
6139
        }                                                                    \
6140
    } else {                                                                 \
6141
        if (av == bv) {                                                      \
6142
            return float_relation_equal;                                     \
6143
        } else {                                                             \
6144
            return 1 - 2 * (aSign ^ ( av < bv ));                            \
6145
        }                                                                    \
6146
    }                                                                        \
6147
}                                                                            \
6148
                                                                             \
6149
int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
6150
{                                                                            \
6151
    return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
6152
}                                                                            \
6153
                                                                             \
6154
int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
6155
{                                                                            \
6156
    return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
6157
}
6158

    
6159
COMPARE(32, 0xff)
6160
COMPARE(64, 0x7ff)
6161

    
6162
INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
6163
                                      int is_quiet STATUS_PARAM )
6164
{
6165
    flag aSign, bSign;
6166

    
6167
    if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6168
          ( extractFloatx80Frac( a )<<1 ) ) ||
6169
        ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6170
          ( extractFloatx80Frac( b )<<1 ) )) {
6171
        if (!is_quiet ||
6172
            floatx80_is_signaling_nan( a ) ||
6173
            floatx80_is_signaling_nan( b ) ) {
6174
            float_raise( float_flag_invalid STATUS_VAR);
6175
        }
6176
        return float_relation_unordered;
6177
    }
6178
    aSign = extractFloatx80Sign( a );
6179
    bSign = extractFloatx80Sign( b );
6180
    if ( aSign != bSign ) {
6181

    
6182
        if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6183
             ( ( a.low | b.low ) == 0 ) ) {
6184
            /* zero case */
6185
            return float_relation_equal;
6186
        } else {
6187
            return 1 - (2 * aSign);
6188
        }
6189
    } else {
6190
        if (a.low == b.low && a.high == b.high) {
6191
            return float_relation_equal;
6192
        } else {
6193
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6194
        }
6195
    }
6196
}
6197

    
6198
int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
6199
{
6200
    return floatx80_compare_internal(a, b, 0 STATUS_VAR);
6201
}
6202

    
6203
int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
6204
{
6205
    return floatx80_compare_internal(a, b, 1 STATUS_VAR);
6206
}
6207

    
6208
INLINE int float128_compare_internal( float128 a, float128 b,
6209
                                      int is_quiet STATUS_PARAM )
6210
{
6211
    flag aSign, bSign;
6212

    
6213
    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6214
          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6215
        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6216
          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6217
        if (!is_quiet ||
6218
            float128_is_signaling_nan( a ) ||
6219
            float128_is_signaling_nan( b ) ) {
6220
            float_raise( float_flag_invalid STATUS_VAR);
6221
        }
6222
        return float_relation_unordered;
6223
    }
6224
    aSign = extractFloat128Sign( a );
6225
    bSign = extractFloat128Sign( b );
6226
    if ( aSign != bSign ) {
6227
        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6228
            /* zero case */
6229
            return float_relation_equal;
6230
        } else {
6231
            return 1 - (2 * aSign);
6232
        }
6233
    } else {
6234
        if (a.low == b.low && a.high == b.high) {
6235
            return float_relation_equal;
6236
        } else {
6237
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6238
        }
6239
    }
6240
}
6241

    
6242
int float128_compare( float128 a, float128 b STATUS_PARAM )
6243
{
6244
    return float128_compare_internal(a, b, 0 STATUS_VAR);
6245
}
6246

    
6247
int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6248
{
6249
    return float128_compare_internal(a, b, 1 STATUS_VAR);
6250
}
6251

    
6252
/* min() and max() functions. These can't be implemented as
6253
 * 'compare and pick one input' because that would mishandle
6254
 * NaNs and +0 vs -0.
6255
 */
6256
#define MINMAX(s, nan_exp)                                              \
6257
INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
6258
                                        int ismin STATUS_PARAM )        \
6259
{                                                                       \
6260
    flag aSign, bSign;                                                  \
6261
    uint ## s ## _t av, bv;                                             \
6262
    a = float ## s ## _squash_input_denormal(a STATUS_VAR);             \
6263
    b = float ## s ## _squash_input_denormal(b STATUS_VAR);             \
6264
    if (float ## s ## _is_any_nan(a) ||                                 \
6265
        float ## s ## _is_any_nan(b)) {                                 \
6266
        return propagateFloat ## s ## NaN(a, b STATUS_VAR);             \
6267
    }                                                                   \
6268
    aSign = extractFloat ## s ## Sign(a);                               \
6269
    bSign = extractFloat ## s ## Sign(b);                               \
6270
    av = float ## s ## _val(a);                                         \
6271
    bv = float ## s ## _val(b);                                         \
6272
    if (aSign != bSign) {                                               \
6273
        if (ismin) {                                                    \
6274
            return aSign ? a : b;                                       \
6275
        } else {                                                        \
6276
            return aSign ? b : a;                                       \
6277
        }                                                               \
6278
    } else {                                                            \
6279
        if (ismin) {                                                    \
6280
            return (aSign ^ (av < bv)) ? a : b;                         \
6281
        } else {                                                        \
6282
            return (aSign ^ (av < bv)) ? b : a;                         \
6283
        }                                                               \
6284
    }                                                                   \
6285
}                                                                       \
6286
                                                                        \
6287
float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM)  \
6288
{                                                                       \
6289
    return float ## s ## _minmax(a, b, 1 STATUS_VAR);                   \
6290
}                                                                       \
6291
                                                                        \
6292
float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM)  \
6293
{                                                                       \
6294
    return float ## s ## _minmax(a, b, 0 STATUS_VAR);                   \
6295
}
6296

    
6297
MINMAX(32, 0xff)
6298
MINMAX(64, 0x7ff)
6299

    
6300

    
6301
/* Multiply A by 2 raised to the power N.  */
6302
float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6303
{
6304
    flag aSign;
6305
    int16_t aExp;
6306
    uint32_t aSig;
6307

    
6308
    a = float32_squash_input_denormal(a STATUS_VAR);
6309
    aSig = extractFloat32Frac( a );
6310
    aExp = extractFloat32Exp( a );
6311
    aSign = extractFloat32Sign( a );
6312

    
6313
    if ( aExp == 0xFF ) {
6314
        if ( aSig ) {
6315
            return propagateFloat32NaN( a, a STATUS_VAR );
6316
        }
6317
        return a;
6318
    }
6319
    if ( aExp != 0 )
6320
        aSig |= 0x00800000;
6321
    else if ( aSig == 0 )
6322
        return a;
6323

    
6324
    if (n > 0x200) {
6325
        n = 0x200;
6326
    } else if (n < -0x200) {
6327
        n = -0x200;
6328
    }
6329

    
6330
    aExp += n - 1;
6331
    aSig <<= 7;
6332
    return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6333
}
6334

    
6335
float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6336
{
6337
    flag aSign;
6338
    int16_t aExp;
6339
    uint64_t aSig;
6340

    
6341
    a = float64_squash_input_denormal(a STATUS_VAR);
6342
    aSig = extractFloat64Frac( a );
6343
    aExp = extractFloat64Exp( a );
6344
    aSign = extractFloat64Sign( a );
6345

    
6346
    if ( aExp == 0x7FF ) {
6347
        if ( aSig ) {
6348
            return propagateFloat64NaN( a, a STATUS_VAR );
6349
        }
6350
        return a;
6351
    }
6352
    if ( aExp != 0 )
6353
        aSig |= LIT64( 0x0010000000000000 );
6354
    else if ( aSig == 0 )
6355
        return a;
6356

    
6357
    if (n > 0x1000) {
6358
        n = 0x1000;
6359
    } else if (n < -0x1000) {
6360
        n = -0x1000;
6361
    }
6362

    
6363
    aExp += n - 1;
6364
    aSig <<= 10;
6365
    return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6366
}
6367

    
6368
floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6369
{
6370
    flag aSign;
6371
    int32_t aExp;
6372
    uint64_t aSig;
6373

    
6374
    aSig = extractFloatx80Frac( a );
6375
    aExp = extractFloatx80Exp( a );
6376
    aSign = extractFloatx80Sign( a );
6377

    
6378
    if ( aExp == 0x7FFF ) {
6379
        if ( aSig<<1 ) {
6380
            return propagateFloatx80NaN( a, a STATUS_VAR );
6381
        }
6382
        return a;
6383
    }
6384

    
6385
    if (aExp == 0 && aSig == 0)
6386
        return a;
6387

    
6388
    if (n > 0x10000) {
6389
        n = 0x10000;
6390
    } else if (n < -0x10000) {
6391
        n = -0x10000;
6392
    }
6393

    
6394
    aExp += n;
6395
    return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6396
                                          aSign, aExp, aSig, 0 STATUS_VAR );
6397
}
6398

    
6399
float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6400
{
6401
    flag aSign;
6402
    int32_t aExp;
6403
    uint64_t aSig0, aSig1;
6404

    
6405
    aSig1 = extractFloat128Frac1( a );
6406
    aSig0 = extractFloat128Frac0( a );
6407
    aExp = extractFloat128Exp( a );
6408
    aSign = extractFloat128Sign( a );
6409
    if ( aExp == 0x7FFF ) {
6410
        if ( aSig0 | aSig1 ) {
6411
            return propagateFloat128NaN( a, a STATUS_VAR );
6412
        }
6413
        return a;
6414
    }
6415
    if ( aExp != 0 )
6416
        aSig0 |= LIT64( 0x0001000000000000 );
6417
    else if ( aSig0 == 0 && aSig1 == 0 )
6418
        return a;
6419

    
6420
    if (n > 0x10000) {
6421
        n = 0x10000;
6422
    } else if (n < -0x10000) {
6423
        n = -0x10000;
6424
    }
6425

    
6426
    aExp += n - 1;
6427
    return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6428
                                          STATUS_VAR );
6429

    
6430
}