Statistics
| Branch: | Revision:

root / fpu / softfloat.c @ 5aea4c58

History | View | Annotate | Download (243.3 kB)

1
/*
2
 * QEMU float support
3
 *
4
 * Derived from SoftFloat.
5
 */
6

    
7
/*============================================================================
8

9
This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
10
Package, Release 2b.
11

12
Written by John R. Hauser.  This work was made possible in part by the
13
International Computer Science Institute, located at Suite 600, 1947 Center
14
Street, Berkeley, California 94704.  Funding was partially provided by the
15
National Science Foundation under grant MIP-9311980.  The original version
16
of this code was written as part of a project to build a fixed-point vector
17
processor in collaboration with the University of California at Berkeley,
18
overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
19
is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20
arithmetic/SoftFloat.html'.
21

22
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
23
been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24
RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25
AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26
COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27
EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28
INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29
OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
30

31
Derivative works are acceptable, even for commercial purposes, so long as
32
(1) the source code for the derivative work includes prominent notice that
33
the work is derivative, and (2) the source code includes prominent notice with
34
these four paragraphs for those parts of this code that are retained.
35

36
=============================================================================*/
37

    
38
/* softfloat (and in particular the code in softfloat-specialize.h) is
39
 * target-dependent and needs the TARGET_* macros.
40
 */
41
#include "config.h"
42

    
43
#include "softfloat.h"
44

    
45
/*----------------------------------------------------------------------------
46
| Primitive arithmetic functions, including multi-word arithmetic, and
47
| division and square root approximations.  (Can be specialized to target if
48
| desired.)
49
*----------------------------------------------------------------------------*/
50
#include "softfloat-macros.h"
51

    
52
/*----------------------------------------------------------------------------
53
| Functions and definitions to determine:  (1) whether tininess for underflow
54
| is detected before or after rounding by default, (2) what (if anything)
55
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
56
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
57
| are propagated from function inputs to output.  These details are target-
58
| specific.
59
*----------------------------------------------------------------------------*/
60
#include "softfloat-specialize.h"
61

    
62
void set_float_rounding_mode(int val STATUS_PARAM)
63
{
64
    STATUS(float_rounding_mode) = val;
65
}
66

    
67
void set_float_exception_flags(int val STATUS_PARAM)
68
{
69
    STATUS(float_exception_flags) = val;
70
}
71

    
72
void set_floatx80_rounding_precision(int val STATUS_PARAM)
73
{
74
    STATUS(floatx80_rounding_precision) = val;
75
}
76

    
77
/*----------------------------------------------------------------------------
78
| Returns the fraction bits of the half-precision floating-point value `a'.
79
*----------------------------------------------------------------------------*/
80

    
81
INLINE uint32_t extractFloat16Frac(float16 a)
82
{
83
    return float16_val(a) & 0x3ff;
84
}
85

    
86
/*----------------------------------------------------------------------------
87
| Returns the exponent bits of the half-precision floating-point value `a'.
88
*----------------------------------------------------------------------------*/
89

    
90
INLINE int16 extractFloat16Exp(float16 a)
91
{
92
    return (float16_val(a) >> 10) & 0x1f;
93
}
94

    
95
/*----------------------------------------------------------------------------
96
| Returns the sign bit of the single-precision floating-point value `a'.
97
*----------------------------------------------------------------------------*/
98

    
99
INLINE flag extractFloat16Sign(float16 a)
100
{
101
    return float16_val(a)>>15;
102
}
103

    
104
/*----------------------------------------------------------------------------
105
| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
106
| and 7, and returns the properly rounded 32-bit integer corresponding to the
107
| input.  If `zSign' is 1, the input is negated before being converted to an
108
| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
109
| is simply rounded to an integer, with the inexact exception raised if the
110
| input cannot be represented exactly as an integer.  However, if the fixed-
111
| point input is too large, the invalid exception is raised and the largest
112
| positive or negative integer is returned.
113
*----------------------------------------------------------------------------*/
114

    
115
static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
116
{
117
    int8 roundingMode;
118
    flag roundNearestEven;
119
    int8 roundIncrement, roundBits;
120
    int32_t z;
121

    
122
    roundingMode = STATUS(float_rounding_mode);
123
    roundNearestEven = ( roundingMode == float_round_nearest_even );
124
    roundIncrement = 0x40;
125
    if ( ! roundNearestEven ) {
126
        if ( roundingMode == float_round_to_zero ) {
127
            roundIncrement = 0;
128
        }
129
        else {
130
            roundIncrement = 0x7F;
131
            if ( zSign ) {
132
                if ( roundingMode == float_round_up ) roundIncrement = 0;
133
            }
134
            else {
135
                if ( roundingMode == float_round_down ) roundIncrement = 0;
136
            }
137
        }
138
    }
139
    roundBits = absZ & 0x7F;
140
    absZ = ( absZ + roundIncrement )>>7;
141
    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
142
    z = absZ;
143
    if ( zSign ) z = - z;
144
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
145
        float_raise( float_flag_invalid STATUS_VAR);
146
        return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
147
    }
148
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
149
    return z;
150

    
151
}
152

    
153
/*----------------------------------------------------------------------------
154
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
155
| `absZ1', with binary point between bits 63 and 64 (between the input words),
156
| and returns the properly rounded 64-bit integer corresponding to the input.
157
| If `zSign' is 1, the input is negated before being converted to an integer.
158
| Ordinarily, the fixed-point input is simply rounded to an integer, with
159
| the inexact exception raised if the input cannot be represented exactly as
160
| an integer.  However, if the fixed-point input is too large, the invalid
161
| exception is raised and the largest positive or negative integer is
162
| returned.
163
*----------------------------------------------------------------------------*/
164

    
165
static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
166
{
167
    int8 roundingMode;
168
    flag roundNearestEven, increment;
169
    int64_t z;
170

    
171
    roundingMode = STATUS(float_rounding_mode);
172
    roundNearestEven = ( roundingMode == float_round_nearest_even );
173
    increment = ( (int64_t) absZ1 < 0 );
174
    if ( ! roundNearestEven ) {
175
        if ( roundingMode == float_round_to_zero ) {
176
            increment = 0;
177
        }
178
        else {
179
            if ( zSign ) {
180
                increment = ( roundingMode == float_round_down ) && absZ1;
181
            }
182
            else {
183
                increment = ( roundingMode == float_round_up ) && absZ1;
184
            }
185
        }
186
    }
187
    if ( increment ) {
188
        ++absZ0;
189
        if ( absZ0 == 0 ) goto overflow;
190
        absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
191
    }
192
    z = absZ0;
193
    if ( zSign ) z = - z;
194
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
195
 overflow:
196
        float_raise( float_flag_invalid STATUS_VAR);
197
        return
198
              zSign ? (int64_t) LIT64( 0x8000000000000000 )
199
            : LIT64( 0x7FFFFFFFFFFFFFFF );
200
    }
201
    if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
202
    return z;
203

    
204
}
205

    
206
/*----------------------------------------------------------------------------
207
| Returns the fraction bits of the single-precision floating-point value `a'.
208
*----------------------------------------------------------------------------*/
209

    
210
INLINE uint32_t extractFloat32Frac( float32 a )
211
{
212

    
213
    return float32_val(a) & 0x007FFFFF;
214

    
215
}
216

    
217
/*----------------------------------------------------------------------------
218
| Returns the exponent bits of the single-precision floating-point value `a'.
219
*----------------------------------------------------------------------------*/
220

    
221
INLINE int16 extractFloat32Exp( float32 a )
222
{
223

    
224
    return ( float32_val(a)>>23 ) & 0xFF;
225

    
226
}
227

    
228
/*----------------------------------------------------------------------------
229
| Returns the sign bit of the single-precision floating-point value `a'.
230
*----------------------------------------------------------------------------*/
231

    
232
INLINE flag extractFloat32Sign( float32 a )
233
{
234

    
235
    return float32_val(a)>>31;
236

    
237
}
238

    
239
/*----------------------------------------------------------------------------
240
| If `a' is denormal and we are in flush-to-zero mode then set the
241
| input-denormal exception and return zero. Otherwise just return the value.
242
*----------------------------------------------------------------------------*/
243
static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
244
{
245
    if (STATUS(flush_inputs_to_zero)) {
246
        if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
247
            float_raise(float_flag_input_denormal STATUS_VAR);
248
            return make_float32(float32_val(a) & 0x80000000);
249
        }
250
    }
251
    return a;
252
}
253

    
254
/*----------------------------------------------------------------------------
255
| Normalizes the subnormal single-precision floating-point value represented
256
| by the denormalized significand `aSig'.  The normalized exponent and
257
| significand are stored at the locations pointed to by `zExpPtr' and
258
| `zSigPtr', respectively.
259
*----------------------------------------------------------------------------*/
260

    
261
static void
262
 normalizeFloat32Subnormal( uint32_t aSig, int16 *zExpPtr, uint32_t *zSigPtr )
263
{
264
    int8 shiftCount;
265

    
266
    shiftCount = countLeadingZeros32( aSig ) - 8;
267
    *zSigPtr = aSig<<shiftCount;
268
    *zExpPtr = 1 - shiftCount;
269

    
270
}
271

    
272
/*----------------------------------------------------------------------------
273
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
274
| single-precision floating-point value, returning the result.  After being
275
| shifted into the proper positions, the three fields are simply added
276
| together to form the result.  This means that any integer portion of `zSig'
277
| will be added into the exponent.  Since a properly normalized significand
278
| will have an integer portion equal to 1, the `zExp' input should be 1 less
279
| than the desired result exponent whenever `zSig' is a complete, normalized
280
| significand.
281
*----------------------------------------------------------------------------*/
282

    
283
INLINE float32 packFloat32( flag zSign, int16 zExp, uint32_t zSig )
284
{
285

    
286
    return make_float32(
287
          ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
288

    
289
}
290

    
291
/*----------------------------------------------------------------------------
292
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
293
| and significand `zSig', and returns the proper single-precision floating-
294
| point value corresponding to the abstract input.  Ordinarily, the abstract
295
| value is simply rounded and packed into the single-precision format, with
296
| the inexact exception raised if the abstract input cannot be represented
297
| exactly.  However, if the abstract value is too large, the overflow and
298
| inexact exceptions are raised and an infinity or maximal finite value is
299
| returned.  If the abstract value is too small, the input value is rounded to
300
| a subnormal number, and the underflow and inexact exceptions are raised if
301
| the abstract input cannot be represented exactly as a subnormal single-
302
| precision floating-point number.
303
|     The input significand `zSig' has its binary point between bits 30
304
| and 29, which is 7 bits to the left of the usual location.  This shifted
305
| significand must be normalized or smaller.  If `zSig' is not normalized,
306
| `zExp' must be 0; in that case, the result returned is a subnormal number,
307
| and it must not require rounding.  In the usual case that `zSig' is
308
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
309
| The handling of underflow and overflow follows the IEC/IEEE Standard for
310
| Binary Floating-Point Arithmetic.
311
*----------------------------------------------------------------------------*/
312

    
313
static float32 roundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
314
{
315
    int8 roundingMode;
316
    flag roundNearestEven;
317
    int8 roundIncrement, roundBits;
318
    flag isTiny;
319

    
320
    roundingMode = STATUS(float_rounding_mode);
321
    roundNearestEven = ( roundingMode == float_round_nearest_even );
322
    roundIncrement = 0x40;
323
    if ( ! roundNearestEven ) {
324
        if ( roundingMode == float_round_to_zero ) {
325
            roundIncrement = 0;
326
        }
327
        else {
328
            roundIncrement = 0x7F;
329
            if ( zSign ) {
330
                if ( roundingMode == float_round_up ) roundIncrement = 0;
331
            }
332
            else {
333
                if ( roundingMode == float_round_down ) roundIncrement = 0;
334
            }
335
        }
336
    }
337
    roundBits = zSig & 0x7F;
338
    if ( 0xFD <= (uint16_t) zExp ) {
339
        if (    ( 0xFD < zExp )
340
             || (    ( zExp == 0xFD )
341
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
342
           ) {
343
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
344
            return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
345
        }
346
        if ( zExp < 0 ) {
347
            if (STATUS(flush_to_zero)) {
348
                float_raise(float_flag_output_denormal STATUS_VAR);
349
                return packFloat32(zSign, 0, 0);
350
            }
351
            isTiny =
352
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
353
                || ( zExp < -1 )
354
                || ( zSig + roundIncrement < 0x80000000 );
355
            shift32RightJamming( zSig, - zExp, &zSig );
356
            zExp = 0;
357
            roundBits = zSig & 0x7F;
358
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
359
        }
360
    }
361
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
362
    zSig = ( zSig + roundIncrement )>>7;
363
    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
364
    if ( zSig == 0 ) zExp = 0;
365
    return packFloat32( zSign, zExp, zSig );
366

    
367
}
368

    
369
/*----------------------------------------------------------------------------
370
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
371
| and significand `zSig', and returns the proper single-precision floating-
372
| point value corresponding to the abstract input.  This routine is just like
373
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
374
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
375
| floating-point exponent.
376
*----------------------------------------------------------------------------*/
377

    
378
static float32
379
 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
380
{
381
    int8 shiftCount;
382

    
383
    shiftCount = countLeadingZeros32( zSig ) - 1;
384
    return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
385

    
386
}
387

    
388
/*----------------------------------------------------------------------------
389
| Returns the fraction bits of the double-precision floating-point value `a'.
390
*----------------------------------------------------------------------------*/
391

    
392
INLINE uint64_t extractFloat64Frac( float64 a )
393
{
394

    
395
    return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
396

    
397
}
398

    
399
/*----------------------------------------------------------------------------
400
| Returns the exponent bits of the double-precision floating-point value `a'.
401
*----------------------------------------------------------------------------*/
402

    
403
INLINE int16 extractFloat64Exp( float64 a )
404
{
405

    
406
    return ( float64_val(a)>>52 ) & 0x7FF;
407

    
408
}
409

    
410
/*----------------------------------------------------------------------------
411
| Returns the sign bit of the double-precision floating-point value `a'.
412
*----------------------------------------------------------------------------*/
413

    
414
INLINE flag extractFloat64Sign( float64 a )
415
{
416

    
417
    return float64_val(a)>>63;
418

    
419
}
420

    
421
/*----------------------------------------------------------------------------
422
| If `a' is denormal and we are in flush-to-zero mode then set the
423
| input-denormal exception and return zero. Otherwise just return the value.
424
*----------------------------------------------------------------------------*/
425
static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
426
{
427
    if (STATUS(flush_inputs_to_zero)) {
428
        if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
429
            float_raise(float_flag_input_denormal STATUS_VAR);
430
            return make_float64(float64_val(a) & (1ULL << 63));
431
        }
432
    }
433
    return a;
434
}
435

    
436
/*----------------------------------------------------------------------------
437
| Normalizes the subnormal double-precision floating-point value represented
438
| by the denormalized significand `aSig'.  The normalized exponent and
439
| significand are stored at the locations pointed to by `zExpPtr' and
440
| `zSigPtr', respectively.
441
*----------------------------------------------------------------------------*/
442

    
443
static void
444
 normalizeFloat64Subnormal( uint64_t aSig, int16 *zExpPtr, uint64_t *zSigPtr )
445
{
446
    int8 shiftCount;
447

    
448
    shiftCount = countLeadingZeros64( aSig ) - 11;
449
    *zSigPtr = aSig<<shiftCount;
450
    *zExpPtr = 1 - shiftCount;
451

    
452
}
453

    
454
/*----------------------------------------------------------------------------
455
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
456
| double-precision floating-point value, returning the result.  After being
457
| shifted into the proper positions, the three fields are simply added
458
| together to form the result.  This means that any integer portion of `zSig'
459
| will be added into the exponent.  Since a properly normalized significand
460
| will have an integer portion equal to 1, the `zExp' input should be 1 less
461
| than the desired result exponent whenever `zSig' is a complete, normalized
462
| significand.
463
*----------------------------------------------------------------------------*/
464

    
465
INLINE float64 packFloat64( flag zSign, int16 zExp, uint64_t zSig )
466
{
467

    
468
    return make_float64(
469
        ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
470

    
471
}
472

    
473
/*----------------------------------------------------------------------------
474
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
475
| and significand `zSig', and returns the proper double-precision floating-
476
| point value corresponding to the abstract input.  Ordinarily, the abstract
477
| value is simply rounded and packed into the double-precision format, with
478
| the inexact exception raised if the abstract input cannot be represented
479
| exactly.  However, if the abstract value is too large, the overflow and
480
| inexact exceptions are raised and an infinity or maximal finite value is
481
| returned.  If the abstract value is too small, the input value is rounded
482
| to a subnormal number, and the underflow and inexact exceptions are raised
483
| if the abstract input cannot be represented exactly as a subnormal double-
484
| precision floating-point number.
485
|     The input significand `zSig' has its binary point between bits 62
486
| and 61, which is 10 bits to the left of the usual location.  This shifted
487
| significand must be normalized or smaller.  If `zSig' is not normalized,
488
| `zExp' must be 0; in that case, the result returned is a subnormal number,
489
| and it must not require rounding.  In the usual case that `zSig' is
490
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
491
| The handling of underflow and overflow follows the IEC/IEEE Standard for
492
| Binary Floating-Point Arithmetic.
493
*----------------------------------------------------------------------------*/
494

    
495
static float64 roundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
496
{
497
    int8 roundingMode;
498
    flag roundNearestEven;
499
    int16 roundIncrement, roundBits;
500
    flag isTiny;
501

    
502
    roundingMode = STATUS(float_rounding_mode);
503
    roundNearestEven = ( roundingMode == float_round_nearest_even );
504
    roundIncrement = 0x200;
505
    if ( ! roundNearestEven ) {
506
        if ( roundingMode == float_round_to_zero ) {
507
            roundIncrement = 0;
508
        }
509
        else {
510
            roundIncrement = 0x3FF;
511
            if ( zSign ) {
512
                if ( roundingMode == float_round_up ) roundIncrement = 0;
513
            }
514
            else {
515
                if ( roundingMode == float_round_down ) roundIncrement = 0;
516
            }
517
        }
518
    }
519
    roundBits = zSig & 0x3FF;
520
    if ( 0x7FD <= (uint16_t) zExp ) {
521
        if (    ( 0x7FD < zExp )
522
             || (    ( zExp == 0x7FD )
523
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
524
           ) {
525
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
526
            return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
527
        }
528
        if ( zExp < 0 ) {
529
            if (STATUS(flush_to_zero)) {
530
                float_raise(float_flag_output_denormal STATUS_VAR);
531
                return packFloat64(zSign, 0, 0);
532
            }
533
            isTiny =
534
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
535
                || ( zExp < -1 )
536
                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
537
            shift64RightJamming( zSig, - zExp, &zSig );
538
            zExp = 0;
539
            roundBits = zSig & 0x3FF;
540
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
541
        }
542
    }
543
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
544
    zSig = ( zSig + roundIncrement )>>10;
545
    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
546
    if ( zSig == 0 ) zExp = 0;
547
    return packFloat64( zSign, zExp, zSig );
548

    
549
}
550

    
551
/*----------------------------------------------------------------------------
552
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
553
| and significand `zSig', and returns the proper double-precision floating-
554
| point value corresponding to the abstract input.  This routine is just like
555
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
556
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
557
| floating-point exponent.
558
*----------------------------------------------------------------------------*/
559

    
560
static float64
561
 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
562
{
563
    int8 shiftCount;
564

    
565
    shiftCount = countLeadingZeros64( zSig ) - 1;
566
    return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
567

    
568
}
569

    
570
/*----------------------------------------------------------------------------
571
| Returns the fraction bits of the extended double-precision floating-point
572
| value `a'.
573
*----------------------------------------------------------------------------*/
574

    
575
INLINE uint64_t extractFloatx80Frac( floatx80 a )
576
{
577

    
578
    return a.low;
579

    
580
}
581

    
582
/*----------------------------------------------------------------------------
583
| Returns the exponent bits of the extended double-precision floating-point
584
| value `a'.
585
*----------------------------------------------------------------------------*/
586

    
587
INLINE int32 extractFloatx80Exp( floatx80 a )
588
{
589

    
590
    return a.high & 0x7FFF;
591

    
592
}
593

    
594
/*----------------------------------------------------------------------------
595
| Returns the sign bit of the extended double-precision floating-point value
596
| `a'.
597
*----------------------------------------------------------------------------*/
598

    
599
INLINE flag extractFloatx80Sign( floatx80 a )
600
{
601

    
602
    return a.high>>15;
603

    
604
}
605

    
606
/*----------------------------------------------------------------------------
607
| Normalizes the subnormal extended double-precision floating-point value
608
| represented by the denormalized significand `aSig'.  The normalized exponent
609
| and significand are stored at the locations pointed to by `zExpPtr' and
610
| `zSigPtr', respectively.
611
*----------------------------------------------------------------------------*/
612

    
613
static void
614
 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
615
{
616
    int8 shiftCount;
617

    
618
    shiftCount = countLeadingZeros64( aSig );
619
    *zSigPtr = aSig<<shiftCount;
620
    *zExpPtr = 1 - shiftCount;
621

    
622
}
623

    
624
/*----------------------------------------------------------------------------
625
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
626
| extended double-precision floating-point value, returning the result.
627
*----------------------------------------------------------------------------*/
628

    
629
INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
630
{
631
    floatx80 z;
632

    
633
    z.low = zSig;
634
    z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
635
    return z;
636

    
637
}
638

    
639
/*----------------------------------------------------------------------------
640
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
641
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
642
| and returns the proper extended double-precision floating-point value
643
| corresponding to the abstract input.  Ordinarily, the abstract value is
644
| rounded and packed into the extended double-precision format, with the
645
| inexact exception raised if the abstract input cannot be represented
646
| exactly.  However, if the abstract value is too large, the overflow and
647
| inexact exceptions are raised and an infinity or maximal finite value is
648
| returned.  If the abstract value is too small, the input value is rounded to
649
| a subnormal number, and the underflow and inexact exceptions are raised if
650
| the abstract input cannot be represented exactly as a subnormal extended
651
| double-precision floating-point number.
652
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
653
| number of bits as single or double precision, respectively.  Otherwise, the
654
| result is rounded to the full precision of the extended double-precision
655
| format.
656
|     The input significand must be normalized or smaller.  If the input
657
| significand is not normalized, `zExp' must be 0; in that case, the result
658
| returned is a subnormal number, and it must not require rounding.  The
659
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
660
| Floating-Point Arithmetic.
661
*----------------------------------------------------------------------------*/
662

    
663
static floatx80
664
 roundAndPackFloatx80(
665
     int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
666
 STATUS_PARAM)
667
{
668
    int8 roundingMode;
669
    flag roundNearestEven, increment, isTiny;
670
    int64 roundIncrement, roundMask, roundBits;
671

    
672
    roundingMode = STATUS(float_rounding_mode);
673
    roundNearestEven = ( roundingMode == float_round_nearest_even );
674
    if ( roundingPrecision == 80 ) goto precision80;
675
    if ( roundingPrecision == 64 ) {
676
        roundIncrement = LIT64( 0x0000000000000400 );
677
        roundMask = LIT64( 0x00000000000007FF );
678
    }
679
    else if ( roundingPrecision == 32 ) {
680
        roundIncrement = LIT64( 0x0000008000000000 );
681
        roundMask = LIT64( 0x000000FFFFFFFFFF );
682
    }
683
    else {
684
        goto precision80;
685
    }
686
    zSig0 |= ( zSig1 != 0 );
687
    if ( ! roundNearestEven ) {
688
        if ( roundingMode == float_round_to_zero ) {
689
            roundIncrement = 0;
690
        }
691
        else {
692
            roundIncrement = roundMask;
693
            if ( zSign ) {
694
                if ( roundingMode == float_round_up ) roundIncrement = 0;
695
            }
696
            else {
697
                if ( roundingMode == float_round_down ) roundIncrement = 0;
698
            }
699
        }
700
    }
701
    roundBits = zSig0 & roundMask;
702
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
703
        if (    ( 0x7FFE < zExp )
704
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
705
           ) {
706
            goto overflow;
707
        }
708
        if ( zExp <= 0 ) {
709
            if (STATUS(flush_to_zero)) {
710
                float_raise(float_flag_output_denormal STATUS_VAR);
711
                return packFloatx80(zSign, 0, 0);
712
            }
713
            isTiny =
714
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
715
                || ( zExp < 0 )
716
                || ( zSig0 <= zSig0 + roundIncrement );
717
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
718
            zExp = 0;
719
            roundBits = zSig0 & roundMask;
720
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
721
            if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
722
            zSig0 += roundIncrement;
723
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
724
            roundIncrement = roundMask + 1;
725
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
726
                roundMask |= roundIncrement;
727
            }
728
            zSig0 &= ~ roundMask;
729
            return packFloatx80( zSign, zExp, zSig0 );
730
        }
731
    }
732
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
733
    zSig0 += roundIncrement;
734
    if ( zSig0 < roundIncrement ) {
735
        ++zExp;
736
        zSig0 = LIT64( 0x8000000000000000 );
737
    }
738
    roundIncrement = roundMask + 1;
739
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
740
        roundMask |= roundIncrement;
741
    }
742
    zSig0 &= ~ roundMask;
743
    if ( zSig0 == 0 ) zExp = 0;
744
    return packFloatx80( zSign, zExp, zSig0 );
745
 precision80:
746
    increment = ( (int64_t) zSig1 < 0 );
747
    if ( ! roundNearestEven ) {
748
        if ( roundingMode == float_round_to_zero ) {
749
            increment = 0;
750
        }
751
        else {
752
            if ( zSign ) {
753
                increment = ( roundingMode == float_round_down ) && zSig1;
754
            }
755
            else {
756
                increment = ( roundingMode == float_round_up ) && zSig1;
757
            }
758
        }
759
    }
760
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
761
        if (    ( 0x7FFE < zExp )
762
             || (    ( zExp == 0x7FFE )
763
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
764
                  && increment
765
                )
766
           ) {
767
            roundMask = 0;
768
 overflow:
769
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
770
            if (    ( roundingMode == float_round_to_zero )
771
                 || ( zSign && ( roundingMode == float_round_up ) )
772
                 || ( ! zSign && ( roundingMode == float_round_down ) )
773
               ) {
774
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
775
            }
776
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
777
        }
778
        if ( zExp <= 0 ) {
779
            isTiny =
780
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
781
                || ( zExp < 0 )
782
                || ! increment
783
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
784
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
785
            zExp = 0;
786
            if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
787
            if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
788
            if ( roundNearestEven ) {
789
                increment = ( (int64_t) zSig1 < 0 );
790
            }
791
            else {
792
                if ( zSign ) {
793
                    increment = ( roundingMode == float_round_down ) && zSig1;
794
                }
795
                else {
796
                    increment = ( roundingMode == float_round_up ) && zSig1;
797
                }
798
            }
799
            if ( increment ) {
800
                ++zSig0;
801
                zSig0 &=
802
                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
803
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
804
            }
805
            return packFloatx80( zSign, zExp, zSig0 );
806
        }
807
    }
808
    if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
809
    if ( increment ) {
810
        ++zSig0;
811
        if ( zSig0 == 0 ) {
812
            ++zExp;
813
            zSig0 = LIT64( 0x8000000000000000 );
814
        }
815
        else {
816
            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
817
        }
818
    }
819
    else {
820
        if ( zSig0 == 0 ) zExp = 0;
821
    }
822
    return packFloatx80( zSign, zExp, zSig0 );
823

    
824
}
825

    
826
/*----------------------------------------------------------------------------
827
| Takes an abstract floating-point value having sign `zSign', exponent
828
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
829
| and returns the proper extended double-precision floating-point value
830
| corresponding to the abstract input.  This routine is just like
831
| `roundAndPackFloatx80' except that the input significand does not have to be
832
| normalized.
833
*----------------------------------------------------------------------------*/
834

    
835
static floatx80
836
 normalizeRoundAndPackFloatx80(
837
     int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
838
 STATUS_PARAM)
839
{
840
    int8 shiftCount;
841

    
842
    if ( zSig0 == 0 ) {
843
        zSig0 = zSig1;
844
        zSig1 = 0;
845
        zExp -= 64;
846
    }
847
    shiftCount = countLeadingZeros64( zSig0 );
848
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
849
    zExp -= shiftCount;
850
    return
851
        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
852

    
853
}
854

    
855
/*----------------------------------------------------------------------------
856
| Returns the least-significant 64 fraction bits of the quadruple-precision
857
| floating-point value `a'.
858
*----------------------------------------------------------------------------*/
859

    
860
INLINE uint64_t extractFloat128Frac1( float128 a )
861
{
862

    
863
    return a.low;
864

    
865
}
866

    
867
/*----------------------------------------------------------------------------
868
| Returns the most-significant 48 fraction bits of the quadruple-precision
869
| floating-point value `a'.
870
*----------------------------------------------------------------------------*/
871

    
872
INLINE uint64_t extractFloat128Frac0( float128 a )
873
{
874

    
875
    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
876

    
877
}
878

    
879
/*----------------------------------------------------------------------------
880
| Returns the exponent bits of the quadruple-precision floating-point value
881
| `a'.
882
*----------------------------------------------------------------------------*/
883

    
884
INLINE int32 extractFloat128Exp( float128 a )
885
{
886

    
887
    return ( a.high>>48 ) & 0x7FFF;
888

    
889
}
890

    
891
/*----------------------------------------------------------------------------
892
| Returns the sign bit of the quadruple-precision floating-point value `a'.
893
*----------------------------------------------------------------------------*/
894

    
895
INLINE flag extractFloat128Sign( float128 a )
896
{
897

    
898
    return a.high>>63;
899

    
900
}
901

    
902
/*----------------------------------------------------------------------------
903
| Normalizes the subnormal quadruple-precision floating-point value
904
| represented by the denormalized significand formed by the concatenation of
905
| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
906
| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
907
| significand are stored at the location pointed to by `zSig0Ptr', and the
908
| least significant 64 bits of the normalized significand are stored at the
909
| location pointed to by `zSig1Ptr'.
910
*----------------------------------------------------------------------------*/
911

    
912
static void
913
 normalizeFloat128Subnormal(
914
     uint64_t aSig0,
915
     uint64_t aSig1,
916
     int32 *zExpPtr,
917
     uint64_t *zSig0Ptr,
918
     uint64_t *zSig1Ptr
919
 )
920
{
921
    int8 shiftCount;
922

    
923
    if ( aSig0 == 0 ) {
924
        shiftCount = countLeadingZeros64( aSig1 ) - 15;
925
        if ( shiftCount < 0 ) {
926
            *zSig0Ptr = aSig1>>( - shiftCount );
927
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
928
        }
929
        else {
930
            *zSig0Ptr = aSig1<<shiftCount;
931
            *zSig1Ptr = 0;
932
        }
933
        *zExpPtr = - shiftCount - 63;
934
    }
935
    else {
936
        shiftCount = countLeadingZeros64( aSig0 ) - 15;
937
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
938
        *zExpPtr = 1 - shiftCount;
939
    }
940

    
941
}
942

    
943
/*----------------------------------------------------------------------------
944
| Packs the sign `zSign', the exponent `zExp', and the significand formed
945
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
946
| floating-point value, returning the result.  After being shifted into the
947
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
948
| added together to form the most significant 32 bits of the result.  This
949
| means that any integer portion of `zSig0' will be added into the exponent.
950
| Since a properly normalized significand will have an integer portion equal
951
| to 1, the `zExp' input should be 1 less than the desired result exponent
952
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
953
| significand.
954
*----------------------------------------------------------------------------*/
955

    
956
INLINE float128
957
 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
958
{
959
    float128 z;
960

    
961
    z.low = zSig1;
962
    z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
963
    return z;
964

    
965
}
966

    
967
/*----------------------------------------------------------------------------
968
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
969
| and extended significand formed by the concatenation of `zSig0', `zSig1',
970
| and `zSig2', and returns the proper quadruple-precision floating-point value
971
| corresponding to the abstract input.  Ordinarily, the abstract value is
972
| simply rounded and packed into the quadruple-precision format, with the
973
| inexact exception raised if the abstract input cannot be represented
974
| exactly.  However, if the abstract value is too large, the overflow and
975
| inexact exceptions are raised and an infinity or maximal finite value is
976
| returned.  If the abstract value is too small, the input value is rounded to
977
| a subnormal number, and the underflow and inexact exceptions are raised if
978
| the abstract input cannot be represented exactly as a subnormal quadruple-
979
| precision floating-point number.
980
|     The input significand must be normalized or smaller.  If the input
981
| significand is not normalized, `zExp' must be 0; in that case, the result
982
| returned is a subnormal number, and it must not require rounding.  In the
983
| usual case that the input significand is normalized, `zExp' must be 1 less
984
| than the ``true'' floating-point exponent.  The handling of underflow and
985
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
986
*----------------------------------------------------------------------------*/
987

    
988
static float128
989
 roundAndPackFloat128(
990
     flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
991
{
992
    int8 roundingMode;
993
    flag roundNearestEven, increment, isTiny;
994

    
995
    roundingMode = STATUS(float_rounding_mode);
996
    roundNearestEven = ( roundingMode == float_round_nearest_even );
997
    increment = ( (int64_t) zSig2 < 0 );
998
    if ( ! roundNearestEven ) {
999
        if ( roundingMode == float_round_to_zero ) {
1000
            increment = 0;
1001
        }
1002
        else {
1003
            if ( zSign ) {
1004
                increment = ( roundingMode == float_round_down ) && zSig2;
1005
            }
1006
            else {
1007
                increment = ( roundingMode == float_round_up ) && zSig2;
1008
            }
1009
        }
1010
    }
1011
    if ( 0x7FFD <= (uint32_t) zExp ) {
1012
        if (    ( 0x7FFD < zExp )
1013
             || (    ( zExp == 0x7FFD )
1014
                  && eq128(
1015
                         LIT64( 0x0001FFFFFFFFFFFF ),
1016
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
1017
                         zSig0,
1018
                         zSig1
1019
                     )
1020
                  && increment
1021
                )
1022
           ) {
1023
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1024
            if (    ( roundingMode == float_round_to_zero )
1025
                 || ( zSign && ( roundingMode == float_round_up ) )
1026
                 || ( ! zSign && ( roundingMode == float_round_down ) )
1027
               ) {
1028
                return
1029
                    packFloat128(
1030
                        zSign,
1031
                        0x7FFE,
1032
                        LIT64( 0x0000FFFFFFFFFFFF ),
1033
                        LIT64( 0xFFFFFFFFFFFFFFFF )
1034
                    );
1035
            }
1036
            return packFloat128( zSign, 0x7FFF, 0, 0 );
1037
        }
1038
        if ( zExp < 0 ) {
1039
            if (STATUS(flush_to_zero)) {
1040
                float_raise(float_flag_output_denormal STATUS_VAR);
1041
                return packFloat128(zSign, 0, 0, 0);
1042
            }
1043
            isTiny =
1044
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1045
                || ( zExp < -1 )
1046
                || ! increment
1047
                || lt128(
1048
                       zSig0,
1049
                       zSig1,
1050
                       LIT64( 0x0001FFFFFFFFFFFF ),
1051
                       LIT64( 0xFFFFFFFFFFFFFFFF )
1052
                   );
1053
            shift128ExtraRightJamming(
1054
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1055
            zExp = 0;
1056
            if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1057
            if ( roundNearestEven ) {
1058
                increment = ( (int64_t) zSig2 < 0 );
1059
            }
1060
            else {
1061
                if ( zSign ) {
1062
                    increment = ( roundingMode == float_round_down ) && zSig2;
1063
                }
1064
                else {
1065
                    increment = ( roundingMode == float_round_up ) && zSig2;
1066
                }
1067
            }
1068
        }
1069
    }
1070
    if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1071
    if ( increment ) {
1072
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1073
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1074
    }
1075
    else {
1076
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1077
    }
1078
    return packFloat128( zSign, zExp, zSig0, zSig1 );
1079

    
1080
}
1081

    
1082
/*----------------------------------------------------------------------------
1083
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1084
| and significand formed by the concatenation of `zSig0' and `zSig1', and
1085
| returns the proper quadruple-precision floating-point value corresponding
1086
| to the abstract input.  This routine is just like `roundAndPackFloat128'
1087
| except that the input significand has fewer bits and does not have to be
1088
| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1089
| point exponent.
1090
*----------------------------------------------------------------------------*/
1091

    
1092
static float128
1093
 normalizeRoundAndPackFloat128(
1094
     flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1095
{
1096
    int8 shiftCount;
1097
    uint64_t zSig2;
1098

    
1099
    if ( zSig0 == 0 ) {
1100
        zSig0 = zSig1;
1101
        zSig1 = 0;
1102
        zExp -= 64;
1103
    }
1104
    shiftCount = countLeadingZeros64( zSig0 ) - 15;
1105
    if ( 0 <= shiftCount ) {
1106
        zSig2 = 0;
1107
        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1108
    }
1109
    else {
1110
        shift128ExtraRightJamming(
1111
            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1112
    }
1113
    zExp -= shiftCount;
1114
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1115

    
1116
}
1117

    
1118
/*----------------------------------------------------------------------------
1119
| Returns the result of converting the 32-bit two's complement integer `a'
1120
| to the single-precision floating-point format.  The conversion is performed
1121
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1122
*----------------------------------------------------------------------------*/
1123

    
1124
float32 int32_to_float32( int32 a STATUS_PARAM )
1125
{
1126
    flag zSign;
1127

    
1128
    if ( a == 0 ) return float32_zero;
1129
    if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1130
    zSign = ( a < 0 );
1131
    return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1132

    
1133
}
1134

    
1135
/*----------------------------------------------------------------------------
1136
| Returns the result of converting the 32-bit two's complement integer `a'
1137
| to the double-precision floating-point format.  The conversion is performed
1138
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1139
*----------------------------------------------------------------------------*/
1140

    
1141
float64 int32_to_float64( int32 a STATUS_PARAM )
1142
{
1143
    flag zSign;
1144
    uint32 absA;
1145
    int8 shiftCount;
1146
    uint64_t zSig;
1147

    
1148
    if ( a == 0 ) return float64_zero;
1149
    zSign = ( a < 0 );
1150
    absA = zSign ? - a : a;
1151
    shiftCount = countLeadingZeros32( absA ) + 21;
1152
    zSig = absA;
1153
    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1154

    
1155
}
1156

    
1157
/*----------------------------------------------------------------------------
1158
| Returns the result of converting the 32-bit two's complement integer `a'
1159
| to the extended double-precision floating-point format.  The conversion
1160
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1161
| Arithmetic.
1162
*----------------------------------------------------------------------------*/
1163

    
1164
floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1165
{
1166
    flag zSign;
1167
    uint32 absA;
1168
    int8 shiftCount;
1169
    uint64_t zSig;
1170

    
1171
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1172
    zSign = ( a < 0 );
1173
    absA = zSign ? - a : a;
1174
    shiftCount = countLeadingZeros32( absA ) + 32;
1175
    zSig = absA;
1176
    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1177

    
1178
}
1179

    
1180
/*----------------------------------------------------------------------------
1181
| Returns the result of converting the 32-bit two's complement integer `a' to
1182
| the quadruple-precision floating-point format.  The conversion is performed
1183
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1184
*----------------------------------------------------------------------------*/
1185

    
1186
float128 int32_to_float128( int32 a STATUS_PARAM )
1187
{
1188
    flag zSign;
1189
    uint32 absA;
1190
    int8 shiftCount;
1191
    uint64_t zSig0;
1192

    
1193
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1194
    zSign = ( a < 0 );
1195
    absA = zSign ? - a : a;
1196
    shiftCount = countLeadingZeros32( absA ) + 17;
1197
    zSig0 = absA;
1198
    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1199

    
1200
}
1201

    
1202
/*----------------------------------------------------------------------------
1203
| Returns the result of converting the 64-bit two's complement integer `a'
1204
| to the single-precision floating-point format.  The conversion is performed
1205
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1206
*----------------------------------------------------------------------------*/
1207

    
1208
float32 int64_to_float32( int64 a STATUS_PARAM )
1209
{
1210
    flag zSign;
1211
    uint64 absA;
1212
    int8 shiftCount;
1213

    
1214
    if ( a == 0 ) return float32_zero;
1215
    zSign = ( a < 0 );
1216
    absA = zSign ? - a : a;
1217
    shiftCount = countLeadingZeros64( absA ) - 40;
1218
    if ( 0 <= shiftCount ) {
1219
        return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1220
    }
1221
    else {
1222
        shiftCount += 7;
1223
        if ( shiftCount < 0 ) {
1224
            shift64RightJamming( absA, - shiftCount, &absA );
1225
        }
1226
        else {
1227
            absA <<= shiftCount;
1228
        }
1229
        return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1230
    }
1231

    
1232
}
1233

    
1234
float32 uint64_to_float32( uint64 a STATUS_PARAM )
1235
{
1236
    int8 shiftCount;
1237

    
1238
    if ( a == 0 ) return float32_zero;
1239
    shiftCount = countLeadingZeros64( a ) - 40;
1240
    if ( 0 <= shiftCount ) {
1241
        return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1242
    }
1243
    else {
1244
        shiftCount += 7;
1245
        if ( shiftCount < 0 ) {
1246
            shift64RightJamming( a, - shiftCount, &a );
1247
        }
1248
        else {
1249
            a <<= shiftCount;
1250
        }
1251
        return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1252
    }
1253
}
1254

    
1255
/*----------------------------------------------------------------------------
1256
| Returns the result of converting the 64-bit two's complement integer `a'
1257
| to the double-precision floating-point format.  The conversion is performed
1258
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1259
*----------------------------------------------------------------------------*/
1260

    
1261
float64 int64_to_float64( int64 a STATUS_PARAM )
1262
{
1263
    flag zSign;
1264

    
1265
    if ( a == 0 ) return float64_zero;
1266
    if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1267
        return packFloat64( 1, 0x43E, 0 );
1268
    }
1269
    zSign = ( a < 0 );
1270
    return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1271

    
1272
}
1273

    
1274
float64 uint64_to_float64( uint64 a STATUS_PARAM )
1275
{
1276
    if ( a == 0 ) return float64_zero;
1277
    return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1278

    
1279
}
1280

    
1281
/*----------------------------------------------------------------------------
1282
| Returns the result of converting the 64-bit two's complement integer `a'
1283
| to the extended double-precision floating-point format.  The conversion
1284
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1285
| Arithmetic.
1286
*----------------------------------------------------------------------------*/
1287

    
1288
floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1289
{
1290
    flag zSign;
1291
    uint64 absA;
1292
    int8 shiftCount;
1293

    
1294
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1295
    zSign = ( a < 0 );
1296
    absA = zSign ? - a : a;
1297
    shiftCount = countLeadingZeros64( absA );
1298
    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1299

    
1300
}
1301

    
1302
/*----------------------------------------------------------------------------
1303
| Returns the result of converting the 64-bit two's complement integer `a' to
1304
| the quadruple-precision floating-point format.  The conversion is performed
1305
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1306
*----------------------------------------------------------------------------*/
1307

    
1308
float128 int64_to_float128( int64 a STATUS_PARAM )
1309
{
1310
    flag zSign;
1311
    uint64 absA;
1312
    int8 shiftCount;
1313
    int32 zExp;
1314
    uint64_t zSig0, zSig1;
1315

    
1316
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1317
    zSign = ( a < 0 );
1318
    absA = zSign ? - a : a;
1319
    shiftCount = countLeadingZeros64( absA ) + 49;
1320
    zExp = 0x406E - shiftCount;
1321
    if ( 64 <= shiftCount ) {
1322
        zSig1 = 0;
1323
        zSig0 = absA;
1324
        shiftCount -= 64;
1325
    }
1326
    else {
1327
        zSig1 = absA;
1328
        zSig0 = 0;
1329
    }
1330
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1331
    return packFloat128( zSign, zExp, zSig0, zSig1 );
1332

    
1333
}
1334

    
1335
/*----------------------------------------------------------------------------
1336
| Returns the result of converting the single-precision floating-point value
1337
| `a' to the 32-bit two's complement integer format.  The conversion is
1338
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1339
| Arithmetic---which means in particular that the conversion is rounded
1340
| according to the current rounding mode.  If `a' is a NaN, the largest
1341
| positive integer is returned.  Otherwise, if the conversion overflows, the
1342
| largest integer with the same sign as `a' is returned.
1343
*----------------------------------------------------------------------------*/
1344

    
1345
int32 float32_to_int32( float32 a STATUS_PARAM )
1346
{
1347
    flag aSign;
1348
    int16 aExp, shiftCount;
1349
    uint32_t aSig;
1350
    uint64_t aSig64;
1351

    
1352
    a = float32_squash_input_denormal(a STATUS_VAR);
1353
    aSig = extractFloat32Frac( a );
1354
    aExp = extractFloat32Exp( a );
1355
    aSign = extractFloat32Sign( a );
1356
    if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1357
    if ( aExp ) aSig |= 0x00800000;
1358
    shiftCount = 0xAF - aExp;
1359
    aSig64 = aSig;
1360
    aSig64 <<= 32;
1361
    if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1362
    return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1363

    
1364
}
1365

    
1366
/*----------------------------------------------------------------------------
1367
| Returns the result of converting the single-precision floating-point value
1368
| `a' to the 32-bit two's complement integer format.  The conversion is
1369
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1370
| Arithmetic, except that the conversion is always rounded toward zero.
1371
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1372
| the conversion overflows, the largest integer with the same sign as `a' is
1373
| returned.
1374
*----------------------------------------------------------------------------*/
1375

    
1376
int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1377
{
1378
    flag aSign;
1379
    int16 aExp, shiftCount;
1380
    uint32_t aSig;
1381
    int32_t z;
1382
    a = float32_squash_input_denormal(a STATUS_VAR);
1383

    
1384
    aSig = extractFloat32Frac( a );
1385
    aExp = extractFloat32Exp( a );
1386
    aSign = extractFloat32Sign( a );
1387
    shiftCount = aExp - 0x9E;
1388
    if ( 0 <= shiftCount ) {
1389
        if ( float32_val(a) != 0xCF000000 ) {
1390
            float_raise( float_flag_invalid STATUS_VAR);
1391
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1392
        }
1393
        return (int32_t) 0x80000000;
1394
    }
1395
    else if ( aExp <= 0x7E ) {
1396
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1397
        return 0;
1398
    }
1399
    aSig = ( aSig | 0x00800000 )<<8;
1400
    z = aSig>>( - shiftCount );
1401
    if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1402
        STATUS(float_exception_flags) |= float_flag_inexact;
1403
    }
1404
    if ( aSign ) z = - z;
1405
    return z;
1406

    
1407
}
1408

    
1409
/*----------------------------------------------------------------------------
1410
| Returns the result of converting the single-precision floating-point value
1411
| `a' to the 16-bit two's complement integer format.  The conversion is
1412
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1413
| Arithmetic, except that the conversion is always rounded toward zero.
1414
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1415
| the conversion overflows, the largest integer with the same sign as `a' is
1416
| returned.
1417
*----------------------------------------------------------------------------*/
1418

    
1419
int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1420
{
1421
    flag aSign;
1422
    int16 aExp, shiftCount;
1423
    uint32_t aSig;
1424
    int32 z;
1425

    
1426
    aSig = extractFloat32Frac( a );
1427
    aExp = extractFloat32Exp( a );
1428
    aSign = extractFloat32Sign( a );
1429
    shiftCount = aExp - 0x8E;
1430
    if ( 0 <= shiftCount ) {
1431
        if ( float32_val(a) != 0xC7000000 ) {
1432
            float_raise( float_flag_invalid STATUS_VAR);
1433
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1434
                return 0x7FFF;
1435
            }
1436
        }
1437
        return (int32_t) 0xffff8000;
1438
    }
1439
    else if ( aExp <= 0x7E ) {
1440
        if ( aExp | aSig ) {
1441
            STATUS(float_exception_flags) |= float_flag_inexact;
1442
        }
1443
        return 0;
1444
    }
1445
    shiftCount -= 0x10;
1446
    aSig = ( aSig | 0x00800000 )<<8;
1447
    z = aSig>>( - shiftCount );
1448
    if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1449
        STATUS(float_exception_flags) |= float_flag_inexact;
1450
    }
1451
    if ( aSign ) {
1452
        z = - z;
1453
    }
1454
    return z;
1455

    
1456
}
1457

    
1458
/*----------------------------------------------------------------------------
1459
| Returns the result of converting the single-precision floating-point value
1460
| `a' to the 64-bit two's complement integer format.  The conversion is
1461
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1462
| Arithmetic---which means in particular that the conversion is rounded
1463
| according to the current rounding mode.  If `a' is a NaN, the largest
1464
| positive integer is returned.  Otherwise, if the conversion overflows, the
1465
| largest integer with the same sign as `a' is returned.
1466
*----------------------------------------------------------------------------*/
1467

    
1468
int64 float32_to_int64( float32 a STATUS_PARAM )
1469
{
1470
    flag aSign;
1471
    int16 aExp, shiftCount;
1472
    uint32_t aSig;
1473
    uint64_t aSig64, aSigExtra;
1474
    a = float32_squash_input_denormal(a STATUS_VAR);
1475

    
1476
    aSig = extractFloat32Frac( a );
1477
    aExp = extractFloat32Exp( a );
1478
    aSign = extractFloat32Sign( a );
1479
    shiftCount = 0xBE - aExp;
1480
    if ( shiftCount < 0 ) {
1481
        float_raise( float_flag_invalid STATUS_VAR);
1482
        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1483
            return LIT64( 0x7FFFFFFFFFFFFFFF );
1484
        }
1485
        return (int64_t) LIT64( 0x8000000000000000 );
1486
    }
1487
    if ( aExp ) aSig |= 0x00800000;
1488
    aSig64 = aSig;
1489
    aSig64 <<= 40;
1490
    shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1491
    return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1492

    
1493
}
1494

    
1495
/*----------------------------------------------------------------------------
1496
| Returns the result of converting the single-precision floating-point value
1497
| `a' to the 64-bit two's complement integer format.  The conversion is
1498
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1499
| Arithmetic, except that the conversion is always rounded toward zero.  If
1500
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1501
| conversion overflows, the largest integer with the same sign as `a' is
1502
| returned.
1503
*----------------------------------------------------------------------------*/
1504

    
1505
int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1506
{
1507
    flag aSign;
1508
    int16 aExp, shiftCount;
1509
    uint32_t aSig;
1510
    uint64_t aSig64;
1511
    int64 z;
1512
    a = float32_squash_input_denormal(a STATUS_VAR);
1513

    
1514
    aSig = extractFloat32Frac( a );
1515
    aExp = extractFloat32Exp( a );
1516
    aSign = extractFloat32Sign( a );
1517
    shiftCount = aExp - 0xBE;
1518
    if ( 0 <= shiftCount ) {
1519
        if ( float32_val(a) != 0xDF000000 ) {
1520
            float_raise( float_flag_invalid STATUS_VAR);
1521
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1522
                return LIT64( 0x7FFFFFFFFFFFFFFF );
1523
            }
1524
        }
1525
        return (int64_t) LIT64( 0x8000000000000000 );
1526
    }
1527
    else if ( aExp <= 0x7E ) {
1528
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1529
        return 0;
1530
    }
1531
    aSig64 = aSig | 0x00800000;
1532
    aSig64 <<= 40;
1533
    z = aSig64>>( - shiftCount );
1534
    if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1535
        STATUS(float_exception_flags) |= float_flag_inexact;
1536
    }
1537
    if ( aSign ) z = - z;
1538
    return z;
1539

    
1540
}
1541

    
1542
/*----------------------------------------------------------------------------
1543
| Returns the result of converting the single-precision floating-point value
1544
| `a' to the double-precision floating-point format.  The conversion is
1545
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1546
| Arithmetic.
1547
*----------------------------------------------------------------------------*/
1548

    
1549
float64 float32_to_float64( float32 a STATUS_PARAM )
1550
{
1551
    flag aSign;
1552
    int16 aExp;
1553
    uint32_t aSig;
1554
    a = float32_squash_input_denormal(a STATUS_VAR);
1555

    
1556
    aSig = extractFloat32Frac( a );
1557
    aExp = extractFloat32Exp( a );
1558
    aSign = extractFloat32Sign( a );
1559
    if ( aExp == 0xFF ) {
1560
        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1561
        return packFloat64( aSign, 0x7FF, 0 );
1562
    }
1563
    if ( aExp == 0 ) {
1564
        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1565
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1566
        --aExp;
1567
    }
1568
    return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1569

    
1570
}
1571

    
1572
/*----------------------------------------------------------------------------
1573
| Returns the result of converting the single-precision floating-point value
1574
| `a' to the extended double-precision floating-point format.  The conversion
1575
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1576
| Arithmetic.
1577
*----------------------------------------------------------------------------*/
1578

    
1579
floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1580
{
1581
    flag aSign;
1582
    int16 aExp;
1583
    uint32_t aSig;
1584

    
1585
    a = float32_squash_input_denormal(a STATUS_VAR);
1586
    aSig = extractFloat32Frac( a );
1587
    aExp = extractFloat32Exp( a );
1588
    aSign = extractFloat32Sign( a );
1589
    if ( aExp == 0xFF ) {
1590
        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1591
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1592
    }
1593
    if ( aExp == 0 ) {
1594
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1595
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1596
    }
1597
    aSig |= 0x00800000;
1598
    return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1599

    
1600
}
1601

    
1602
/*----------------------------------------------------------------------------
1603
| Returns the result of converting the single-precision floating-point value
1604
| `a' to the double-precision floating-point format.  The conversion is
1605
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1606
| Arithmetic.
1607
*----------------------------------------------------------------------------*/
1608

    
1609
float128 float32_to_float128( float32 a STATUS_PARAM )
1610
{
1611
    flag aSign;
1612
    int16 aExp;
1613
    uint32_t aSig;
1614

    
1615
    a = float32_squash_input_denormal(a STATUS_VAR);
1616
    aSig = extractFloat32Frac( a );
1617
    aExp = extractFloat32Exp( a );
1618
    aSign = extractFloat32Sign( a );
1619
    if ( aExp == 0xFF ) {
1620
        if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1621
        return packFloat128( aSign, 0x7FFF, 0, 0 );
1622
    }
1623
    if ( aExp == 0 ) {
1624
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1625
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1626
        --aExp;
1627
    }
1628
    return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1629

    
1630
}
1631

    
1632
/*----------------------------------------------------------------------------
1633
| Rounds the single-precision floating-point value `a' to an integer, and
1634
| returns the result as a single-precision floating-point value.  The
1635
| operation is performed according to the IEC/IEEE Standard for Binary
1636
| Floating-Point Arithmetic.
1637
*----------------------------------------------------------------------------*/
1638

    
1639
float32 float32_round_to_int( float32 a STATUS_PARAM)
1640
{
1641
    flag aSign;
1642
    int16 aExp;
1643
    uint32_t lastBitMask, roundBitsMask;
1644
    int8 roundingMode;
1645
    uint32_t z;
1646
    a = float32_squash_input_denormal(a STATUS_VAR);
1647

    
1648
    aExp = extractFloat32Exp( a );
1649
    if ( 0x96 <= aExp ) {
1650
        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1651
            return propagateFloat32NaN( a, a STATUS_VAR );
1652
        }
1653
        return a;
1654
    }
1655
    if ( aExp <= 0x7E ) {
1656
        if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1657
        STATUS(float_exception_flags) |= float_flag_inexact;
1658
        aSign = extractFloat32Sign( a );
1659
        switch ( STATUS(float_rounding_mode) ) {
1660
         case float_round_nearest_even:
1661
            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1662
                return packFloat32( aSign, 0x7F, 0 );
1663
            }
1664
            break;
1665
         case float_round_down:
1666
            return make_float32(aSign ? 0xBF800000 : 0);
1667
         case float_round_up:
1668
            return make_float32(aSign ? 0x80000000 : 0x3F800000);
1669
        }
1670
        return packFloat32( aSign, 0, 0 );
1671
    }
1672
    lastBitMask = 1;
1673
    lastBitMask <<= 0x96 - aExp;
1674
    roundBitsMask = lastBitMask - 1;
1675
    z = float32_val(a);
1676
    roundingMode = STATUS(float_rounding_mode);
1677
    if ( roundingMode == float_round_nearest_even ) {
1678
        z += lastBitMask>>1;
1679
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1680
    }
1681
    else if ( roundingMode != float_round_to_zero ) {
1682
        if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1683
            z += roundBitsMask;
1684
        }
1685
    }
1686
    z &= ~ roundBitsMask;
1687
    if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1688
    return make_float32(z);
1689

    
1690
}
1691

    
1692
/*----------------------------------------------------------------------------
1693
| Returns the result of adding the absolute values of the single-precision
1694
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1695
| before being returned.  `zSign' is ignored if the result is a NaN.
1696
| The addition is performed according to the IEC/IEEE Standard for Binary
1697
| Floating-Point Arithmetic.
1698
*----------------------------------------------------------------------------*/
1699

    
1700
static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1701
{
1702
    int16 aExp, bExp, zExp;
1703
    uint32_t aSig, bSig, zSig;
1704
    int16 expDiff;
1705

    
1706
    aSig = extractFloat32Frac( a );
1707
    aExp = extractFloat32Exp( a );
1708
    bSig = extractFloat32Frac( b );
1709
    bExp = extractFloat32Exp( b );
1710
    expDiff = aExp - bExp;
1711
    aSig <<= 6;
1712
    bSig <<= 6;
1713
    if ( 0 < expDiff ) {
1714
        if ( aExp == 0xFF ) {
1715
            if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1716
            return a;
1717
        }
1718
        if ( bExp == 0 ) {
1719
            --expDiff;
1720
        }
1721
        else {
1722
            bSig |= 0x20000000;
1723
        }
1724
        shift32RightJamming( bSig, expDiff, &bSig );
1725
        zExp = aExp;
1726
    }
1727
    else if ( expDiff < 0 ) {
1728
        if ( bExp == 0xFF ) {
1729
            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1730
            return packFloat32( zSign, 0xFF, 0 );
1731
        }
1732
        if ( aExp == 0 ) {
1733
            ++expDiff;
1734
        }
1735
        else {
1736
            aSig |= 0x20000000;
1737
        }
1738
        shift32RightJamming( aSig, - expDiff, &aSig );
1739
        zExp = bExp;
1740
    }
1741
    else {
1742
        if ( aExp == 0xFF ) {
1743
            if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1744
            return a;
1745
        }
1746
        if ( aExp == 0 ) {
1747
            if (STATUS(flush_to_zero)) {
1748
                if (aSig | bSig) {
1749
                    float_raise(float_flag_output_denormal STATUS_VAR);
1750
                }
1751
                return packFloat32(zSign, 0, 0);
1752
            }
1753
            return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1754
        }
1755
        zSig = 0x40000000 + aSig + bSig;
1756
        zExp = aExp;
1757
        goto roundAndPack;
1758
    }
1759
    aSig |= 0x20000000;
1760
    zSig = ( aSig + bSig )<<1;
1761
    --zExp;
1762
    if ( (int32_t) zSig < 0 ) {
1763
        zSig = aSig + bSig;
1764
        ++zExp;
1765
    }
1766
 roundAndPack:
1767
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1768

    
1769
}
1770

    
1771
/*----------------------------------------------------------------------------
1772
| Returns the result of subtracting the absolute values of the single-
1773
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
1774
| difference is negated before being returned.  `zSign' is ignored if the
1775
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
1776
| Standard for Binary Floating-Point Arithmetic.
1777
*----------------------------------------------------------------------------*/
1778

    
1779
static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1780
{
1781
    int16 aExp, bExp, zExp;
1782
    uint32_t aSig, bSig, zSig;
1783
    int16 expDiff;
1784

    
1785
    aSig = extractFloat32Frac( a );
1786
    aExp = extractFloat32Exp( a );
1787
    bSig = extractFloat32Frac( b );
1788
    bExp = extractFloat32Exp( b );
1789
    expDiff = aExp - bExp;
1790
    aSig <<= 7;
1791
    bSig <<= 7;
1792
    if ( 0 < expDiff ) goto aExpBigger;
1793
    if ( expDiff < 0 ) goto bExpBigger;
1794
    if ( aExp == 0xFF ) {
1795
        if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1796
        float_raise( float_flag_invalid STATUS_VAR);
1797
        return float32_default_nan;
1798
    }
1799
    if ( aExp == 0 ) {
1800
        aExp = 1;
1801
        bExp = 1;
1802
    }
1803
    if ( bSig < aSig ) goto aBigger;
1804
    if ( aSig < bSig ) goto bBigger;
1805
    return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1806
 bExpBigger:
1807
    if ( bExp == 0xFF ) {
1808
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1809
        return packFloat32( zSign ^ 1, 0xFF, 0 );
1810
    }
1811
    if ( aExp == 0 ) {
1812
        ++expDiff;
1813
    }
1814
    else {
1815
        aSig |= 0x40000000;
1816
    }
1817
    shift32RightJamming( aSig, - expDiff, &aSig );
1818
    bSig |= 0x40000000;
1819
 bBigger:
1820
    zSig = bSig - aSig;
1821
    zExp = bExp;
1822
    zSign ^= 1;
1823
    goto normalizeRoundAndPack;
1824
 aExpBigger:
1825
    if ( aExp == 0xFF ) {
1826
        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1827
        return a;
1828
    }
1829
    if ( bExp == 0 ) {
1830
        --expDiff;
1831
    }
1832
    else {
1833
        bSig |= 0x40000000;
1834
    }
1835
    shift32RightJamming( bSig, expDiff, &bSig );
1836
    aSig |= 0x40000000;
1837
 aBigger:
1838
    zSig = aSig - bSig;
1839
    zExp = aExp;
1840
 normalizeRoundAndPack:
1841
    --zExp;
1842
    return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1843

    
1844
}
1845

    
1846
/*----------------------------------------------------------------------------
1847
| Returns the result of adding the single-precision floating-point values `a'
1848
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
1849
| Binary Floating-Point Arithmetic.
1850
*----------------------------------------------------------------------------*/
1851

    
1852
float32 float32_add( float32 a, float32 b STATUS_PARAM )
1853
{
1854
    flag aSign, bSign;
1855
    a = float32_squash_input_denormal(a STATUS_VAR);
1856
    b = float32_squash_input_denormal(b STATUS_VAR);
1857

    
1858
    aSign = extractFloat32Sign( a );
1859
    bSign = extractFloat32Sign( b );
1860
    if ( aSign == bSign ) {
1861
        return addFloat32Sigs( a, b, aSign STATUS_VAR);
1862
    }
1863
    else {
1864
        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1865
    }
1866

    
1867
}
1868

    
1869
/*----------------------------------------------------------------------------
1870
| Returns the result of subtracting the single-precision floating-point values
1871
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1872
| for Binary Floating-Point Arithmetic.
1873
*----------------------------------------------------------------------------*/
1874

    
1875
float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1876
{
1877
    flag aSign, bSign;
1878
    a = float32_squash_input_denormal(a STATUS_VAR);
1879
    b = float32_squash_input_denormal(b STATUS_VAR);
1880

    
1881
    aSign = extractFloat32Sign( a );
1882
    bSign = extractFloat32Sign( b );
1883
    if ( aSign == bSign ) {
1884
        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1885
    }
1886
    else {
1887
        return addFloat32Sigs( a, b, aSign STATUS_VAR );
1888
    }
1889

    
1890
}
1891

    
1892
/*----------------------------------------------------------------------------
1893
| Returns the result of multiplying the single-precision floating-point values
1894
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1895
| for Binary Floating-Point Arithmetic.
1896
*----------------------------------------------------------------------------*/
1897

    
1898
float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1899
{
1900
    flag aSign, bSign, zSign;
1901
    int16 aExp, bExp, zExp;
1902
    uint32_t aSig, bSig;
1903
    uint64_t zSig64;
1904
    uint32_t zSig;
1905

    
1906
    a = float32_squash_input_denormal(a STATUS_VAR);
1907
    b = float32_squash_input_denormal(b STATUS_VAR);
1908

    
1909
    aSig = extractFloat32Frac( a );
1910
    aExp = extractFloat32Exp( a );
1911
    aSign = extractFloat32Sign( a );
1912
    bSig = extractFloat32Frac( b );
1913
    bExp = extractFloat32Exp( b );
1914
    bSign = extractFloat32Sign( b );
1915
    zSign = aSign ^ bSign;
1916
    if ( aExp == 0xFF ) {
1917
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1918
            return propagateFloat32NaN( a, b STATUS_VAR );
1919
        }
1920
        if ( ( bExp | bSig ) == 0 ) {
1921
            float_raise( float_flag_invalid STATUS_VAR);
1922
            return float32_default_nan;
1923
        }
1924
        return packFloat32( zSign, 0xFF, 0 );
1925
    }
1926
    if ( bExp == 0xFF ) {
1927
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1928
        if ( ( aExp | aSig ) == 0 ) {
1929
            float_raise( float_flag_invalid STATUS_VAR);
1930
            return float32_default_nan;
1931
        }
1932
        return packFloat32( zSign, 0xFF, 0 );
1933
    }
1934
    if ( aExp == 0 ) {
1935
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1936
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1937
    }
1938
    if ( bExp == 0 ) {
1939
        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1940
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1941
    }
1942
    zExp = aExp + bExp - 0x7F;
1943
    aSig = ( aSig | 0x00800000 )<<7;
1944
    bSig = ( bSig | 0x00800000 )<<8;
1945
    shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
1946
    zSig = zSig64;
1947
    if ( 0 <= (int32_t) ( zSig<<1 ) ) {
1948
        zSig <<= 1;
1949
        --zExp;
1950
    }
1951
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1952

    
1953
}
1954

    
1955
/*----------------------------------------------------------------------------
1956
| Returns the result of dividing the single-precision floating-point value `a'
1957
| by the corresponding value `b'.  The operation is performed according to the
1958
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1959
*----------------------------------------------------------------------------*/
1960

    
1961
float32 float32_div( float32 a, float32 b STATUS_PARAM )
1962
{
1963
    flag aSign, bSign, zSign;
1964
    int16 aExp, bExp, zExp;
1965
    uint32_t aSig, bSig, zSig;
1966
    a = float32_squash_input_denormal(a STATUS_VAR);
1967
    b = float32_squash_input_denormal(b STATUS_VAR);
1968

    
1969
    aSig = extractFloat32Frac( a );
1970
    aExp = extractFloat32Exp( a );
1971
    aSign = extractFloat32Sign( a );
1972
    bSig = extractFloat32Frac( b );
1973
    bExp = extractFloat32Exp( b );
1974
    bSign = extractFloat32Sign( b );
1975
    zSign = aSign ^ bSign;
1976
    if ( aExp == 0xFF ) {
1977
        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1978
        if ( bExp == 0xFF ) {
1979
            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1980
            float_raise( float_flag_invalid STATUS_VAR);
1981
            return float32_default_nan;
1982
        }
1983
        return packFloat32( zSign, 0xFF, 0 );
1984
    }
1985
    if ( bExp == 0xFF ) {
1986
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1987
        return packFloat32( zSign, 0, 0 );
1988
    }
1989
    if ( bExp == 0 ) {
1990
        if ( bSig == 0 ) {
1991
            if ( ( aExp | aSig ) == 0 ) {
1992
                float_raise( float_flag_invalid STATUS_VAR);
1993
                return float32_default_nan;
1994
            }
1995
            float_raise( float_flag_divbyzero STATUS_VAR);
1996
            return packFloat32( zSign, 0xFF, 0 );
1997
        }
1998
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1999
    }
2000
    if ( aExp == 0 ) {
2001
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2002
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2003
    }
2004
    zExp = aExp - bExp + 0x7D;
2005
    aSig = ( aSig | 0x00800000 )<<7;
2006
    bSig = ( bSig | 0x00800000 )<<8;
2007
    if ( bSig <= ( aSig + aSig ) ) {
2008
        aSig >>= 1;
2009
        ++zExp;
2010
    }
2011
    zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2012
    if ( ( zSig & 0x3F ) == 0 ) {
2013
        zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2014
    }
2015
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2016

    
2017
}
2018

    
2019
/*----------------------------------------------------------------------------
2020
| Returns the remainder of the single-precision floating-point value `a'
2021
| with respect to the corresponding value `b'.  The operation is performed
2022
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2023
*----------------------------------------------------------------------------*/
2024

    
2025
float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2026
{
2027
    flag aSign, zSign;
2028
    int16 aExp, bExp, expDiff;
2029
    uint32_t aSig, bSig;
2030
    uint32_t q;
2031
    uint64_t aSig64, bSig64, q64;
2032
    uint32_t alternateASig;
2033
    int32_t sigMean;
2034
    a = float32_squash_input_denormal(a STATUS_VAR);
2035
    b = float32_squash_input_denormal(b STATUS_VAR);
2036

    
2037
    aSig = extractFloat32Frac( a );
2038
    aExp = extractFloat32Exp( a );
2039
    aSign = extractFloat32Sign( a );
2040
    bSig = extractFloat32Frac( b );
2041
    bExp = extractFloat32Exp( b );
2042
    if ( aExp == 0xFF ) {
2043
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2044
            return propagateFloat32NaN( a, b STATUS_VAR );
2045
        }
2046
        float_raise( float_flag_invalid STATUS_VAR);
2047
        return float32_default_nan;
2048
    }
2049
    if ( bExp == 0xFF ) {
2050
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2051
        return a;
2052
    }
2053
    if ( bExp == 0 ) {
2054
        if ( bSig == 0 ) {
2055
            float_raise( float_flag_invalid STATUS_VAR);
2056
            return float32_default_nan;
2057
        }
2058
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2059
    }
2060
    if ( aExp == 0 ) {
2061
        if ( aSig == 0 ) return a;
2062
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2063
    }
2064
    expDiff = aExp - bExp;
2065
    aSig |= 0x00800000;
2066
    bSig |= 0x00800000;
2067
    if ( expDiff < 32 ) {
2068
        aSig <<= 8;
2069
        bSig <<= 8;
2070
        if ( expDiff < 0 ) {
2071
            if ( expDiff < -1 ) return a;
2072
            aSig >>= 1;
2073
        }
2074
        q = ( bSig <= aSig );
2075
        if ( q ) aSig -= bSig;
2076
        if ( 0 < expDiff ) {
2077
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2078
            q >>= 32 - expDiff;
2079
            bSig >>= 2;
2080
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2081
        }
2082
        else {
2083
            aSig >>= 2;
2084
            bSig >>= 2;
2085
        }
2086
    }
2087
    else {
2088
        if ( bSig <= aSig ) aSig -= bSig;
2089
        aSig64 = ( (uint64_t) aSig )<<40;
2090
        bSig64 = ( (uint64_t) bSig )<<40;
2091
        expDiff -= 64;
2092
        while ( 0 < expDiff ) {
2093
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2094
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2095
            aSig64 = - ( ( bSig * q64 )<<38 );
2096
            expDiff -= 62;
2097
        }
2098
        expDiff += 64;
2099
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2100
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2101
        q = q64>>( 64 - expDiff );
2102
        bSig <<= 6;
2103
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2104
    }
2105
    do {
2106
        alternateASig = aSig;
2107
        ++q;
2108
        aSig -= bSig;
2109
    } while ( 0 <= (int32_t) aSig );
2110
    sigMean = aSig + alternateASig;
2111
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2112
        aSig = alternateASig;
2113
    }
2114
    zSign = ( (int32_t) aSig < 0 );
2115
    if ( zSign ) aSig = - aSig;
2116
    return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2117

    
2118
}
2119

    
2120
/*----------------------------------------------------------------------------
2121
| Returns the result of multiplying the single-precision floating-point values
2122
| `a' and `b' then adding 'c', with no intermediate rounding step after the
2123
| multiplication.  The operation is performed according to the IEC/IEEE
2124
| Standard for Binary Floating-Point Arithmetic 754-2008.
2125
| The flags argument allows the caller to select negation of the
2126
| addend, the intermediate product, or the final result. (The difference
2127
| between this and having the caller do a separate negation is that negating
2128
| externally will flip the sign bit on NaNs.)
2129
*----------------------------------------------------------------------------*/
2130

    
2131
float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
2132
{
2133
    flag aSign, bSign, cSign, zSign;
2134
    int16 aExp, bExp, cExp, pExp, zExp, expDiff;
2135
    uint32_t aSig, bSig, cSig;
2136
    flag pInf, pZero, pSign;
2137
    uint64_t pSig64, cSig64, zSig64;
2138
    uint32_t pSig;
2139
    int shiftcount;
2140
    flag signflip, infzero;
2141

    
2142
    a = float32_squash_input_denormal(a STATUS_VAR);
2143
    b = float32_squash_input_denormal(b STATUS_VAR);
2144
    c = float32_squash_input_denormal(c STATUS_VAR);
2145
    aSig = extractFloat32Frac(a);
2146
    aExp = extractFloat32Exp(a);
2147
    aSign = extractFloat32Sign(a);
2148
    bSig = extractFloat32Frac(b);
2149
    bExp = extractFloat32Exp(b);
2150
    bSign = extractFloat32Sign(b);
2151
    cSig = extractFloat32Frac(c);
2152
    cExp = extractFloat32Exp(c);
2153
    cSign = extractFloat32Sign(c);
2154

    
2155
    infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2156
               (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2157

    
2158
    /* It is implementation-defined whether the cases of (0,inf,qnan)
2159
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2160
     * they return if they do), so we have to hand this information
2161
     * off to the target-specific pick-a-NaN routine.
2162
     */
2163
    if (((aExp == 0xff) && aSig) ||
2164
        ((bExp == 0xff) && bSig) ||
2165
        ((cExp == 0xff) && cSig)) {
2166
        return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
2167
    }
2168

    
2169
    if (infzero) {
2170
        float_raise(float_flag_invalid STATUS_VAR);
2171
        return float32_default_nan;
2172
    }
2173

    
2174
    if (flags & float_muladd_negate_c) {
2175
        cSign ^= 1;
2176
    }
2177

    
2178
    signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2179

    
2180
    /* Work out the sign and type of the product */
2181
    pSign = aSign ^ bSign;
2182
    if (flags & float_muladd_negate_product) {
2183
        pSign ^= 1;
2184
    }
2185
    pInf = (aExp == 0xff) || (bExp == 0xff);
2186
    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2187

    
2188
    if (cExp == 0xff) {
2189
        if (pInf && (pSign ^ cSign)) {
2190
            /* addition of opposite-signed infinities => InvalidOperation */
2191
            float_raise(float_flag_invalid STATUS_VAR);
2192
            return float32_default_nan;
2193
        }
2194
        /* Otherwise generate an infinity of the same sign */
2195
        return packFloat32(cSign ^ signflip, 0xff, 0);
2196
    }
2197

    
2198
    if (pInf) {
2199
        return packFloat32(pSign ^ signflip, 0xff, 0);
2200
    }
2201

    
2202
    if (pZero) {
2203
        if (cExp == 0) {
2204
            if (cSig == 0) {
2205
                /* Adding two exact zeroes */
2206
                if (pSign == cSign) {
2207
                    zSign = pSign;
2208
                } else if (STATUS(float_rounding_mode) == float_round_down) {
2209
                    zSign = 1;
2210
                } else {
2211
                    zSign = 0;
2212
                }
2213
                return packFloat32(zSign ^ signflip, 0, 0);
2214
            }
2215
            /* Exact zero plus a denorm */
2216
            if (STATUS(flush_to_zero)) {
2217
                float_raise(float_flag_output_denormal STATUS_VAR);
2218
                return packFloat32(cSign ^ signflip, 0, 0);
2219
            }
2220
        }
2221
        /* Zero plus something non-zero : just return the something */
2222
        return make_float32(float32_val(c) ^ (signflip << 31));
2223
    }
2224

    
2225
    if (aExp == 0) {
2226
        normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2227
    }
2228
    if (bExp == 0) {
2229
        normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2230
    }
2231

    
2232
    /* Calculate the actual result a * b + c */
2233

    
2234
    /* Multiply first; this is easy. */
2235
    /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2236
     * because we want the true exponent, not the "one-less-than"
2237
     * flavour that roundAndPackFloat32() takes.
2238
     */
2239
    pExp = aExp + bExp - 0x7e;
2240
    aSig = (aSig | 0x00800000) << 7;
2241
    bSig = (bSig | 0x00800000) << 8;
2242
    pSig64 = (uint64_t)aSig * bSig;
2243
    if ((int64_t)(pSig64 << 1) >= 0) {
2244
        pSig64 <<= 1;
2245
        pExp--;
2246
    }
2247

    
2248
    zSign = pSign ^ signflip;
2249

    
2250
    /* Now pSig64 is the significand of the multiply, with the explicit bit in
2251
     * position 62.
2252
     */
2253
    if (cExp == 0) {
2254
        if (!cSig) {
2255
            /* Throw out the special case of c being an exact zero now */
2256
            shift64RightJamming(pSig64, 32, &pSig64);
2257
            pSig = pSig64;
2258
            return roundAndPackFloat32(zSign, pExp - 1,
2259
                                       pSig STATUS_VAR);
2260
        }
2261
        normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2262
    }
2263

    
2264
    cSig64 = (uint64_t)cSig << (62 - 23);
2265
    cSig64 |= LIT64(0x4000000000000000);
2266
    expDiff = pExp - cExp;
2267

    
2268
    if (pSign == cSign) {
2269
        /* Addition */
2270
        if (expDiff > 0) {
2271
            /* scale c to match p */
2272
            shift64RightJamming(cSig64, expDiff, &cSig64);
2273
            zExp = pExp;
2274
        } else if (expDiff < 0) {
2275
            /* scale p to match c */
2276
            shift64RightJamming(pSig64, -expDiff, &pSig64);
2277
            zExp = cExp;
2278
        } else {
2279
            /* no scaling needed */
2280
            zExp = cExp;
2281
        }
2282
        /* Add significands and make sure explicit bit ends up in posn 62 */
2283
        zSig64 = pSig64 + cSig64;
2284
        if ((int64_t)zSig64 < 0) {
2285
            shift64RightJamming(zSig64, 1, &zSig64);
2286
        } else {
2287
            zExp--;
2288
        }
2289
    } else {
2290
        /* Subtraction */
2291
        if (expDiff > 0) {
2292
            shift64RightJamming(cSig64, expDiff, &cSig64);
2293
            zSig64 = pSig64 - cSig64;
2294
            zExp = pExp;
2295
        } else if (expDiff < 0) {
2296
            shift64RightJamming(pSig64, -expDiff, &pSig64);
2297
            zSig64 = cSig64 - pSig64;
2298
            zExp = cExp;
2299
            zSign ^= 1;
2300
        } else {
2301
            zExp = pExp;
2302
            if (cSig64 < pSig64) {
2303
                zSig64 = pSig64 - cSig64;
2304
            } else if (pSig64 < cSig64) {
2305
                zSig64 = cSig64 - pSig64;
2306
                zSign ^= 1;
2307
            } else {
2308
                /* Exact zero */
2309
                zSign = signflip;
2310
                if (STATUS(float_rounding_mode) == float_round_down) {
2311
                    zSign ^= 1;
2312
                }
2313
                return packFloat32(zSign, 0, 0);
2314
            }
2315
        }
2316
        --zExp;
2317
        /* Normalize to put the explicit bit back into bit 62. */
2318
        shiftcount = countLeadingZeros64(zSig64) - 1;
2319
        zSig64 <<= shiftcount;
2320
        zExp -= shiftcount;
2321
    }
2322
    shift64RightJamming(zSig64, 32, &zSig64);
2323
    return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
2324
}
2325

    
2326

    
2327
/*----------------------------------------------------------------------------
2328
| Returns the square root of the single-precision floating-point value `a'.
2329
| The operation is performed according to the IEC/IEEE Standard for Binary
2330
| Floating-Point Arithmetic.
2331
*----------------------------------------------------------------------------*/
2332

    
2333
float32 float32_sqrt( float32 a STATUS_PARAM )
2334
{
2335
    flag aSign;
2336
    int16 aExp, zExp;
2337
    uint32_t aSig, zSig;
2338
    uint64_t rem, term;
2339
    a = float32_squash_input_denormal(a STATUS_VAR);
2340

    
2341
    aSig = extractFloat32Frac( a );
2342
    aExp = extractFloat32Exp( a );
2343
    aSign = extractFloat32Sign( a );
2344
    if ( aExp == 0xFF ) {
2345
        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2346
        if ( ! aSign ) return a;
2347
        float_raise( float_flag_invalid STATUS_VAR);
2348
        return float32_default_nan;
2349
    }
2350
    if ( aSign ) {
2351
        if ( ( aExp | aSig ) == 0 ) return a;
2352
        float_raise( float_flag_invalid STATUS_VAR);
2353
        return float32_default_nan;
2354
    }
2355
    if ( aExp == 0 ) {
2356
        if ( aSig == 0 ) return float32_zero;
2357
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2358
    }
2359
    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2360
    aSig = ( aSig | 0x00800000 )<<8;
2361
    zSig = estimateSqrt32( aExp, aSig ) + 2;
2362
    if ( ( zSig & 0x7F ) <= 5 ) {
2363
        if ( zSig < 2 ) {
2364
            zSig = 0x7FFFFFFF;
2365
            goto roundAndPack;
2366
        }
2367
        aSig >>= aExp & 1;
2368
        term = ( (uint64_t) zSig ) * zSig;
2369
        rem = ( ( (uint64_t) aSig )<<32 ) - term;
2370
        while ( (int64_t) rem < 0 ) {
2371
            --zSig;
2372
            rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2373
        }
2374
        zSig |= ( rem != 0 );
2375
    }
2376
    shift32RightJamming( zSig, 1, &zSig );
2377
 roundAndPack:
2378
    return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2379

    
2380
}
2381

    
2382
/*----------------------------------------------------------------------------
2383
| Returns the binary exponential of the single-precision floating-point value
2384
| `a'. The operation is performed according to the IEC/IEEE Standard for
2385
| Binary Floating-Point Arithmetic.
2386
|
2387
| Uses the following identities:
2388
|
2389
| 1. -------------------------------------------------------------------------
2390
|      x    x*ln(2)
2391
|     2  = e
2392
|
2393
| 2. -------------------------------------------------------------------------
2394
|                      2     3     4     5           n
2395
|      x        x     x     x     x     x           x
2396
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2397
|               1!    2!    3!    4!    5!          n!
2398
*----------------------------------------------------------------------------*/
2399

    
2400
static const float64 float32_exp2_coefficients[15] =
2401
{
2402
    const_float64( 0x3ff0000000000000ll ), /*  1 */
2403
    const_float64( 0x3fe0000000000000ll ), /*  2 */
2404
    const_float64( 0x3fc5555555555555ll ), /*  3 */
2405
    const_float64( 0x3fa5555555555555ll ), /*  4 */
2406
    const_float64( 0x3f81111111111111ll ), /*  5 */
2407
    const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2408
    const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2409
    const_float64( 0x3efa01a01a01a01all ), /*  8 */
2410
    const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2411
    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2412
    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2413
    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2414
    const_float64( 0x3de6124613a86d09ll ), /* 13 */
2415
    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2416
    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2417
};
2418

    
2419
float32 float32_exp2( float32 a STATUS_PARAM )
2420
{
2421
    flag aSign;
2422
    int16 aExp;
2423
    uint32_t aSig;
2424
    float64 r, x, xn;
2425
    int i;
2426
    a = float32_squash_input_denormal(a STATUS_VAR);
2427

    
2428
    aSig = extractFloat32Frac( a );
2429
    aExp = extractFloat32Exp( a );
2430
    aSign = extractFloat32Sign( a );
2431

    
2432
    if ( aExp == 0xFF) {
2433
        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2434
        return (aSign) ? float32_zero : a;
2435
    }
2436
    if (aExp == 0) {
2437
        if (aSig == 0) return float32_one;
2438
    }
2439

    
2440
    float_raise( float_flag_inexact STATUS_VAR);
2441

    
2442
    /* ******************************* */
2443
    /* using float64 for approximation */
2444
    /* ******************************* */
2445
    x = float32_to_float64(a STATUS_VAR);
2446
    x = float64_mul(x, float64_ln2 STATUS_VAR);
2447

    
2448
    xn = x;
2449
    r = float64_one;
2450
    for (i = 0 ; i < 15 ; i++) {
2451
        float64 f;
2452

    
2453
        f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2454
        r = float64_add(r, f STATUS_VAR);
2455

    
2456
        xn = float64_mul(xn, x STATUS_VAR);
2457
    }
2458

    
2459
    return float64_to_float32(r, status);
2460
}
2461

    
2462
/*----------------------------------------------------------------------------
2463
| Returns the binary log of the single-precision floating-point value `a'.
2464
| The operation is performed according to the IEC/IEEE Standard for Binary
2465
| Floating-Point Arithmetic.
2466
*----------------------------------------------------------------------------*/
2467
float32 float32_log2( float32 a STATUS_PARAM )
2468
{
2469
    flag aSign, zSign;
2470
    int16 aExp;
2471
    uint32_t aSig, zSig, i;
2472

    
2473
    a = float32_squash_input_denormal(a STATUS_VAR);
2474
    aSig = extractFloat32Frac( a );
2475
    aExp = extractFloat32Exp( a );
2476
    aSign = extractFloat32Sign( a );
2477

    
2478
    if ( aExp == 0 ) {
2479
        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2480
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2481
    }
2482
    if ( aSign ) {
2483
        float_raise( float_flag_invalid STATUS_VAR);
2484
        return float32_default_nan;
2485
    }
2486
    if ( aExp == 0xFF ) {
2487
        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2488
        return a;
2489
    }
2490

    
2491
    aExp -= 0x7F;
2492
    aSig |= 0x00800000;
2493
    zSign = aExp < 0;
2494
    zSig = aExp << 23;
2495

    
2496
    for (i = 1 << 22; i > 0; i >>= 1) {
2497
        aSig = ( (uint64_t)aSig * aSig ) >> 23;
2498
        if ( aSig & 0x01000000 ) {
2499
            aSig >>= 1;
2500
            zSig |= i;
2501
        }
2502
    }
2503

    
2504
    if ( zSign )
2505
        zSig = -zSig;
2506

    
2507
    return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2508
}
2509

    
2510
/*----------------------------------------------------------------------------
2511
| Returns 1 if the single-precision floating-point value `a' is equal to
2512
| the corresponding value `b', and 0 otherwise.  The invalid exception is
2513
| raised if either operand is a NaN.  Otherwise, the comparison is performed
2514
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2515
*----------------------------------------------------------------------------*/
2516

    
2517
int float32_eq( float32 a, float32 b STATUS_PARAM )
2518
{
2519
    uint32_t av, bv;
2520
    a = float32_squash_input_denormal(a STATUS_VAR);
2521
    b = float32_squash_input_denormal(b STATUS_VAR);
2522

    
2523
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2524
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2525
       ) {
2526
        float_raise( float_flag_invalid STATUS_VAR);
2527
        return 0;
2528
    }
2529
    av = float32_val(a);
2530
    bv = float32_val(b);
2531
    return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2532
}
2533

    
2534
/*----------------------------------------------------------------------------
2535
| Returns 1 if the single-precision floating-point value `a' is less than
2536
| or equal to the corresponding value `b', and 0 otherwise.  The invalid
2537
| exception is raised if either operand is a NaN.  The comparison is performed
2538
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2539
*----------------------------------------------------------------------------*/
2540

    
2541
int float32_le( float32 a, float32 b STATUS_PARAM )
2542
{
2543
    flag aSign, bSign;
2544
    uint32_t av, bv;
2545
    a = float32_squash_input_denormal(a STATUS_VAR);
2546
    b = float32_squash_input_denormal(b STATUS_VAR);
2547

    
2548
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2549
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2550
       ) {
2551
        float_raise( float_flag_invalid STATUS_VAR);
2552
        return 0;
2553
    }
2554
    aSign = extractFloat32Sign( a );
2555
    bSign = extractFloat32Sign( b );
2556
    av = float32_val(a);
2557
    bv = float32_val(b);
2558
    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2559
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2560

    
2561
}
2562

    
2563
/*----------------------------------------------------------------------------
2564
| Returns 1 if the single-precision floating-point value `a' is less than
2565
| the corresponding value `b', and 0 otherwise.  The invalid exception is
2566
| raised if either operand is a NaN.  The comparison is performed according
2567
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2568
*----------------------------------------------------------------------------*/
2569

    
2570
int float32_lt( float32 a, float32 b STATUS_PARAM )
2571
{
2572
    flag aSign, bSign;
2573
    uint32_t av, bv;
2574
    a = float32_squash_input_denormal(a STATUS_VAR);
2575
    b = float32_squash_input_denormal(b STATUS_VAR);
2576

    
2577
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2578
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2579
       ) {
2580
        float_raise( float_flag_invalid STATUS_VAR);
2581
        return 0;
2582
    }
2583
    aSign = extractFloat32Sign( a );
2584
    bSign = extractFloat32Sign( b );
2585
    av = float32_val(a);
2586
    bv = float32_val(b);
2587
    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2588
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2589

    
2590
}
2591

    
2592
/*----------------------------------------------------------------------------
2593
| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2594
| be compared, and 0 otherwise.  The invalid exception is raised if either
2595
| operand is a NaN.  The comparison is performed according to the IEC/IEEE
2596
| Standard for Binary Floating-Point Arithmetic.
2597
*----------------------------------------------------------------------------*/
2598

    
2599
int float32_unordered( float32 a, float32 b STATUS_PARAM )
2600
{
2601
    a = float32_squash_input_denormal(a STATUS_VAR);
2602
    b = float32_squash_input_denormal(b STATUS_VAR);
2603

    
2604
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2605
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2606
       ) {
2607
        float_raise( float_flag_invalid STATUS_VAR);
2608
        return 1;
2609
    }
2610
    return 0;
2611
}
2612

    
2613
/*----------------------------------------------------------------------------
2614
| Returns 1 if the single-precision floating-point value `a' is equal to
2615
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2616
| exception.  The comparison is performed according to the IEC/IEEE Standard
2617
| for Binary Floating-Point Arithmetic.
2618
*----------------------------------------------------------------------------*/
2619

    
2620
int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2621
{
2622
    a = float32_squash_input_denormal(a STATUS_VAR);
2623
    b = float32_squash_input_denormal(b STATUS_VAR);
2624

    
2625
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2626
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2627
       ) {
2628
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2629
            float_raise( float_flag_invalid STATUS_VAR);
2630
        }
2631
        return 0;
2632
    }
2633
    return ( float32_val(a) == float32_val(b) ) ||
2634
            ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2635
}
2636

    
2637
/*----------------------------------------------------------------------------
2638
| Returns 1 if the single-precision floating-point value `a' is less than or
2639
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2640
| cause an exception.  Otherwise, the comparison is performed according to the
2641
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2642
*----------------------------------------------------------------------------*/
2643

    
2644
int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2645
{
2646
    flag aSign, bSign;
2647
    uint32_t av, bv;
2648
    a = float32_squash_input_denormal(a STATUS_VAR);
2649
    b = float32_squash_input_denormal(b STATUS_VAR);
2650

    
2651
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2652
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2653
       ) {
2654
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2655
            float_raise( float_flag_invalid STATUS_VAR);
2656
        }
2657
        return 0;
2658
    }
2659
    aSign = extractFloat32Sign( a );
2660
    bSign = extractFloat32Sign( b );
2661
    av = float32_val(a);
2662
    bv = float32_val(b);
2663
    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2664
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2665

    
2666
}
2667

    
2668
/*----------------------------------------------------------------------------
2669
| Returns 1 if the single-precision floating-point value `a' is less than
2670
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2671
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2672
| Standard for Binary Floating-Point Arithmetic.
2673
*----------------------------------------------------------------------------*/
2674

    
2675
int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2676
{
2677
    flag aSign, bSign;
2678
    uint32_t av, bv;
2679
    a = float32_squash_input_denormal(a STATUS_VAR);
2680
    b = float32_squash_input_denormal(b STATUS_VAR);
2681

    
2682
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2683
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2684
       ) {
2685
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2686
            float_raise( float_flag_invalid STATUS_VAR);
2687
        }
2688
        return 0;
2689
    }
2690
    aSign = extractFloat32Sign( a );
2691
    bSign = extractFloat32Sign( b );
2692
    av = float32_val(a);
2693
    bv = float32_val(b);
2694
    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2695
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2696

    
2697
}
2698

    
2699
/*----------------------------------------------------------------------------
2700
| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2701
| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
2702
| comparison is performed according to the IEC/IEEE Standard for Binary
2703
| Floating-Point Arithmetic.
2704
*----------------------------------------------------------------------------*/
2705

    
2706
int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2707
{
2708
    a = float32_squash_input_denormal(a STATUS_VAR);
2709
    b = float32_squash_input_denormal(b STATUS_VAR);
2710

    
2711
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2712
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2713
       ) {
2714
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2715
            float_raise( float_flag_invalid STATUS_VAR);
2716
        }
2717
        return 1;
2718
    }
2719
    return 0;
2720
}
2721

    
2722
/*----------------------------------------------------------------------------
2723
| Returns the result of converting the double-precision floating-point value
2724
| `a' to the 32-bit two's complement integer format.  The conversion is
2725
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2726
| Arithmetic---which means in particular that the conversion is rounded
2727
| according to the current rounding mode.  If `a' is a NaN, the largest
2728
| positive integer is returned.  Otherwise, if the conversion overflows, the
2729
| largest integer with the same sign as `a' is returned.
2730
*----------------------------------------------------------------------------*/
2731

    
2732
int32 float64_to_int32( float64 a STATUS_PARAM )
2733
{
2734
    flag aSign;
2735
    int16 aExp, shiftCount;
2736
    uint64_t aSig;
2737
    a = float64_squash_input_denormal(a STATUS_VAR);
2738

    
2739
    aSig = extractFloat64Frac( a );
2740
    aExp = extractFloat64Exp( a );
2741
    aSign = extractFloat64Sign( a );
2742
    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2743
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2744
    shiftCount = 0x42C - aExp;
2745
    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2746
    return roundAndPackInt32( aSign, aSig STATUS_VAR );
2747

    
2748
}
2749

    
2750
/*----------------------------------------------------------------------------
2751
| Returns the result of converting the double-precision floating-point value
2752
| `a' to the 32-bit two's complement integer format.  The conversion is
2753
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2754
| Arithmetic, except that the conversion is always rounded toward zero.
2755
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2756
| the conversion overflows, the largest integer with the same sign as `a' is
2757
| returned.
2758
*----------------------------------------------------------------------------*/
2759

    
2760
int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2761
{
2762
    flag aSign;
2763
    int16 aExp, shiftCount;
2764
    uint64_t aSig, savedASig;
2765
    int32_t z;
2766
    a = float64_squash_input_denormal(a STATUS_VAR);
2767

    
2768
    aSig = extractFloat64Frac( a );
2769
    aExp = extractFloat64Exp( a );
2770
    aSign = extractFloat64Sign( a );
2771
    if ( 0x41E < aExp ) {
2772
        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2773
        goto invalid;
2774
    }
2775
    else if ( aExp < 0x3FF ) {
2776
        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2777
        return 0;
2778
    }
2779
    aSig |= LIT64( 0x0010000000000000 );
2780
    shiftCount = 0x433 - aExp;
2781
    savedASig = aSig;
2782
    aSig >>= shiftCount;
2783
    z = aSig;
2784
    if ( aSign ) z = - z;
2785
    if ( ( z < 0 ) ^ aSign ) {
2786
 invalid:
2787
        float_raise( float_flag_invalid STATUS_VAR);
2788
        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2789
    }
2790
    if ( ( aSig<<shiftCount ) != savedASig ) {
2791
        STATUS(float_exception_flags) |= float_flag_inexact;
2792
    }
2793
    return z;
2794

    
2795
}
2796

    
2797
/*----------------------------------------------------------------------------
2798
| Returns the result of converting the double-precision floating-point value
2799
| `a' to the 16-bit two's complement integer format.  The conversion is
2800
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2801
| Arithmetic, except that the conversion is always rounded toward zero.
2802
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2803
| the conversion overflows, the largest integer with the same sign as `a' is
2804
| returned.
2805
*----------------------------------------------------------------------------*/
2806

    
2807
int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2808
{
2809
    flag aSign;
2810
    int16 aExp, shiftCount;
2811
    uint64_t aSig, savedASig;
2812
    int32 z;
2813

    
2814
    aSig = extractFloat64Frac( a );
2815
    aExp = extractFloat64Exp( a );
2816
    aSign = extractFloat64Sign( a );
2817
    if ( 0x40E < aExp ) {
2818
        if ( ( aExp == 0x7FF ) && aSig ) {
2819
            aSign = 0;
2820
        }
2821
        goto invalid;
2822
    }
2823
    else if ( aExp < 0x3FF ) {
2824
        if ( aExp || aSig ) {
2825
            STATUS(float_exception_flags) |= float_flag_inexact;
2826
        }
2827
        return 0;
2828
    }
2829
    aSig |= LIT64( 0x0010000000000000 );
2830
    shiftCount = 0x433 - aExp;
2831
    savedASig = aSig;
2832
    aSig >>= shiftCount;
2833
    z = aSig;
2834
    if ( aSign ) {
2835
        z = - z;
2836
    }
2837
    if ( ( (int16_t)z < 0 ) ^ aSign ) {
2838
 invalid:
2839
        float_raise( float_flag_invalid STATUS_VAR);
2840
        return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2841
    }
2842
    if ( ( aSig<<shiftCount ) != savedASig ) {
2843
        STATUS(float_exception_flags) |= float_flag_inexact;
2844
    }
2845
    return z;
2846
}
2847

    
2848
/*----------------------------------------------------------------------------
2849
| Returns the result of converting the double-precision floating-point value
2850
| `a' to the 64-bit two's complement integer format.  The conversion is
2851
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2852
| Arithmetic---which means in particular that the conversion is rounded
2853
| according to the current rounding mode.  If `a' is a NaN, the largest
2854
| positive integer is returned.  Otherwise, if the conversion overflows, the
2855
| largest integer with the same sign as `a' is returned.
2856
*----------------------------------------------------------------------------*/
2857

    
2858
int64 float64_to_int64( float64 a STATUS_PARAM )
2859
{
2860
    flag aSign;
2861
    int16 aExp, shiftCount;
2862
    uint64_t aSig, aSigExtra;
2863
    a = float64_squash_input_denormal(a STATUS_VAR);
2864

    
2865
    aSig = extractFloat64Frac( a );
2866
    aExp = extractFloat64Exp( a );
2867
    aSign = extractFloat64Sign( a );
2868
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2869
    shiftCount = 0x433 - aExp;
2870
    if ( shiftCount <= 0 ) {
2871
        if ( 0x43E < aExp ) {
2872
            float_raise( float_flag_invalid STATUS_VAR);
2873
            if (    ! aSign
2874
                 || (    ( aExp == 0x7FF )
2875
                      && ( aSig != LIT64( 0x0010000000000000 ) ) )
2876
               ) {
2877
                return LIT64( 0x7FFFFFFFFFFFFFFF );
2878
            }
2879
            return (int64_t) LIT64( 0x8000000000000000 );
2880
        }
2881
        aSigExtra = 0;
2882
        aSig <<= - shiftCount;
2883
    }
2884
    else {
2885
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2886
    }
2887
    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2888

    
2889
}
2890

    
2891
/*----------------------------------------------------------------------------
2892
| Returns the result of converting the double-precision floating-point value
2893
| `a' to the 64-bit two's complement integer format.  The conversion is
2894
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2895
| Arithmetic, except that the conversion is always rounded toward zero.
2896
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2897
| the conversion overflows, the largest integer with the same sign as `a' is
2898
| returned.
2899
*----------------------------------------------------------------------------*/
2900

    
2901
int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2902
{
2903
    flag aSign;
2904
    int16 aExp, shiftCount;
2905
    uint64_t aSig;
2906
    int64 z;
2907
    a = float64_squash_input_denormal(a STATUS_VAR);
2908

    
2909
    aSig = extractFloat64Frac( a );
2910
    aExp = extractFloat64Exp( a );
2911
    aSign = extractFloat64Sign( a );
2912
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2913
    shiftCount = aExp - 0x433;
2914
    if ( 0 <= shiftCount ) {
2915
        if ( 0x43E <= aExp ) {
2916
            if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2917
                float_raise( float_flag_invalid STATUS_VAR);
2918
                if (    ! aSign
2919
                     || (    ( aExp == 0x7FF )
2920
                          && ( aSig != LIT64( 0x0010000000000000 ) ) )
2921
                   ) {
2922
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
2923
                }
2924
            }
2925
            return (int64_t) LIT64( 0x8000000000000000 );
2926
        }
2927
        z = aSig<<shiftCount;
2928
    }
2929
    else {
2930
        if ( aExp < 0x3FE ) {
2931
            if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2932
            return 0;
2933
        }
2934
        z = aSig>>( - shiftCount );
2935
        if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
2936
            STATUS(float_exception_flags) |= float_flag_inexact;
2937
        }
2938
    }
2939
    if ( aSign ) z = - z;
2940
    return z;
2941

    
2942
}
2943

    
2944
/*----------------------------------------------------------------------------
2945
| Returns the result of converting the double-precision floating-point value
2946
| `a' to the single-precision floating-point format.  The conversion is
2947
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2948
| Arithmetic.
2949
*----------------------------------------------------------------------------*/
2950

    
2951
float32 float64_to_float32( float64 a STATUS_PARAM )
2952
{
2953
    flag aSign;
2954
    int16 aExp;
2955
    uint64_t aSig;
2956
    uint32_t zSig;
2957
    a = float64_squash_input_denormal(a STATUS_VAR);
2958

    
2959
    aSig = extractFloat64Frac( a );
2960
    aExp = extractFloat64Exp( a );
2961
    aSign = extractFloat64Sign( a );
2962
    if ( aExp == 0x7FF ) {
2963
        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2964
        return packFloat32( aSign, 0xFF, 0 );
2965
    }
2966
    shift64RightJamming( aSig, 22, &aSig );
2967
    zSig = aSig;
2968
    if ( aExp || zSig ) {
2969
        zSig |= 0x40000000;
2970
        aExp -= 0x381;
2971
    }
2972
    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2973

    
2974
}
2975

    
2976

    
2977
/*----------------------------------------------------------------------------
2978
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2979
| half-precision floating-point value, returning the result.  After being
2980
| shifted into the proper positions, the three fields are simply added
2981
| together to form the result.  This means that any integer portion of `zSig'
2982
| will be added into the exponent.  Since a properly normalized significand
2983
| will have an integer portion equal to 1, the `zExp' input should be 1 less
2984
| than the desired result exponent whenever `zSig' is a complete, normalized
2985
| significand.
2986
*----------------------------------------------------------------------------*/
2987
static float16 packFloat16(flag zSign, int16 zExp, uint16_t zSig)
2988
{
2989
    return make_float16(
2990
        (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
2991
}
2992

    
2993
/* Half precision floats come in two formats: standard IEEE and "ARM" format.
2994
   The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
2995

    
2996
float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
2997
{
2998
    flag aSign;
2999
    int16 aExp;
3000
    uint32_t aSig;
3001

    
3002
    aSign = extractFloat16Sign(a);
3003
    aExp = extractFloat16Exp(a);
3004
    aSig = extractFloat16Frac(a);
3005

    
3006
    if (aExp == 0x1f && ieee) {
3007
        if (aSig) {
3008
            return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3009
        }
3010
        return packFloat32(aSign, 0xff, aSig << 13);
3011
    }
3012
    if (aExp == 0) {
3013
        int8 shiftCount;
3014

    
3015
        if (aSig == 0) {
3016
            return packFloat32(aSign, 0, 0);
3017
        }
3018

    
3019
        shiftCount = countLeadingZeros32( aSig ) - 21;
3020
        aSig = aSig << shiftCount;
3021
        aExp = -shiftCount;
3022
    }
3023
    return packFloat32( aSign, aExp + 0x70, aSig << 13);
3024
}
3025

    
3026
float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
3027
{
3028
    flag aSign;
3029
    int16 aExp;
3030
    uint32_t aSig;
3031
    uint32_t mask;
3032
    uint32_t increment;
3033
    int8 roundingMode;
3034
    a = float32_squash_input_denormal(a STATUS_VAR);
3035

    
3036
    aSig = extractFloat32Frac( a );
3037
    aExp = extractFloat32Exp( a );
3038
    aSign = extractFloat32Sign( a );
3039
    if ( aExp == 0xFF ) {
3040
        if (aSig) {
3041
            /* Input is a NaN */
3042
            float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3043
            if (!ieee) {
3044
                return packFloat16(aSign, 0, 0);
3045
            }
3046
            return r;
3047
        }
3048
        /* Infinity */
3049
        if (!ieee) {
3050
            float_raise(float_flag_invalid STATUS_VAR);
3051
            return packFloat16(aSign, 0x1f, 0x3ff);
3052
        }
3053
        return packFloat16(aSign, 0x1f, 0);
3054
    }
3055
    if (aExp == 0 && aSig == 0) {
3056
        return packFloat16(aSign, 0, 0);
3057
    }
3058
    /* Decimal point between bits 22 and 23.  */
3059
    aSig |= 0x00800000;
3060
    aExp -= 0x7f;
3061
    if (aExp < -14) {
3062
        mask = 0x00ffffff;
3063
        if (aExp >= -24) {
3064
            mask >>= 25 + aExp;
3065
        }
3066
    } else {
3067
        mask = 0x00001fff;
3068
    }
3069
    if (aSig & mask) {
3070
        float_raise( float_flag_underflow STATUS_VAR );
3071
        roundingMode = STATUS(float_rounding_mode);
3072
        switch (roundingMode) {
3073
        case float_round_nearest_even:
3074
            increment = (mask + 1) >> 1;
3075
            if ((aSig & mask) == increment) {
3076
                increment = aSig & (increment << 1);
3077
            }
3078
            break;
3079
        case float_round_up:
3080
            increment = aSign ? 0 : mask;
3081
            break;
3082
        case float_round_down:
3083
            increment = aSign ? mask : 0;
3084
            break;
3085
        default: /* round_to_zero */
3086
            increment = 0;
3087
            break;
3088
        }
3089
        aSig += increment;
3090
        if (aSig >= 0x01000000) {
3091
            aSig >>= 1;
3092
            aExp++;
3093
        }
3094
    } else if (aExp < -14
3095
          && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
3096
        float_raise( float_flag_underflow STATUS_VAR);
3097
    }
3098

    
3099
    if (ieee) {
3100
        if (aExp > 15) {
3101
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
3102
            return packFloat16(aSign, 0x1f, 0);
3103
        }
3104
    } else {
3105
        if (aExp > 16) {
3106
            float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
3107
            return packFloat16(aSign, 0x1f, 0x3ff);
3108
        }
3109
    }
3110
    if (aExp < -24) {
3111
        return packFloat16(aSign, 0, 0);
3112
    }
3113
    if (aExp < -14) {
3114
        aSig >>= -14 - aExp;
3115
        aExp = -14;
3116
    }
3117
    return packFloat16(aSign, aExp + 14, aSig >> 13);
3118
}
3119

    
3120
/*----------------------------------------------------------------------------
3121
| Returns the result of converting the double-precision floating-point value
3122
| `a' to the extended double-precision floating-point format.  The conversion
3123
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3124
| Arithmetic.
3125
*----------------------------------------------------------------------------*/
3126

    
3127
floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
3128
{
3129
    flag aSign;
3130
    int16 aExp;
3131
    uint64_t aSig;
3132

    
3133
    a = float64_squash_input_denormal(a STATUS_VAR);
3134
    aSig = extractFloat64Frac( a );
3135
    aExp = extractFloat64Exp( a );
3136
    aSign = extractFloat64Sign( a );
3137
    if ( aExp == 0x7FF ) {
3138
        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3139
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3140
    }
3141
    if ( aExp == 0 ) {
3142
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3143
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3144
    }
3145
    return
3146
        packFloatx80(
3147
            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3148

    
3149
}
3150

    
3151
/*----------------------------------------------------------------------------
3152
| Returns the result of converting the double-precision floating-point value
3153
| `a' to the quadruple-precision floating-point format.  The conversion is
3154
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3155
| Arithmetic.
3156
*----------------------------------------------------------------------------*/
3157

    
3158
float128 float64_to_float128( float64 a STATUS_PARAM )
3159
{
3160
    flag aSign;
3161
    int16 aExp;
3162
    uint64_t aSig, zSig0, zSig1;
3163

    
3164
    a = float64_squash_input_denormal(a STATUS_VAR);
3165
    aSig = extractFloat64Frac( a );
3166
    aExp = extractFloat64Exp( a );
3167
    aSign = extractFloat64Sign( a );
3168
    if ( aExp == 0x7FF ) {
3169
        if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3170
        return packFloat128( aSign, 0x7FFF, 0, 0 );
3171
    }
3172
    if ( aExp == 0 ) {
3173
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3174
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3175
        --aExp;
3176
    }
3177
    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3178
    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3179

    
3180
}
3181

    
3182
/*----------------------------------------------------------------------------
3183
| Rounds the double-precision floating-point value `a' to an integer, and
3184
| returns the result as a double-precision floating-point value.  The
3185
| operation is performed according to the IEC/IEEE Standard for Binary
3186
| Floating-Point Arithmetic.
3187
*----------------------------------------------------------------------------*/
3188

    
3189
float64 float64_round_to_int( float64 a STATUS_PARAM )
3190
{
3191
    flag aSign;
3192
    int16 aExp;
3193
    uint64_t lastBitMask, roundBitsMask;
3194
    int8 roundingMode;
3195
    uint64_t z;
3196
    a = float64_squash_input_denormal(a STATUS_VAR);
3197

    
3198
    aExp = extractFloat64Exp( a );
3199
    if ( 0x433 <= aExp ) {
3200
        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3201
            return propagateFloat64NaN( a, a STATUS_VAR );
3202
        }
3203
        return a;
3204
    }
3205
    if ( aExp < 0x3FF ) {
3206
        if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3207
        STATUS(float_exception_flags) |= float_flag_inexact;
3208
        aSign = extractFloat64Sign( a );
3209
        switch ( STATUS(float_rounding_mode) ) {
3210
         case float_round_nearest_even:
3211
            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3212
                return packFloat64( aSign, 0x3FF, 0 );
3213
            }
3214
            break;
3215
         case float_round_down:
3216
            return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3217
         case float_round_up:
3218
            return make_float64(
3219
            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3220
        }
3221
        return packFloat64( aSign, 0, 0 );
3222
    }
3223
    lastBitMask = 1;
3224
    lastBitMask <<= 0x433 - aExp;
3225
    roundBitsMask = lastBitMask - 1;
3226
    z = float64_val(a);
3227
    roundingMode = STATUS(float_rounding_mode);
3228
    if ( roundingMode == float_round_nearest_even ) {
3229
        z += lastBitMask>>1;
3230
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3231
    }
3232
    else if ( roundingMode != float_round_to_zero ) {
3233
        if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3234
            z += roundBitsMask;
3235
        }
3236
    }
3237
    z &= ~ roundBitsMask;
3238
    if ( z != float64_val(a) )
3239
        STATUS(float_exception_flags) |= float_flag_inexact;
3240
    return make_float64(z);
3241

    
3242
}
3243

    
3244
float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3245
{
3246
    int oldmode;
3247
    float64 res;
3248
    oldmode = STATUS(float_rounding_mode);
3249
    STATUS(float_rounding_mode) = float_round_to_zero;
3250
    res = float64_round_to_int(a STATUS_VAR);
3251
    STATUS(float_rounding_mode) = oldmode;
3252
    return res;
3253
}
3254

    
3255
/*----------------------------------------------------------------------------
3256
| Returns the result of adding the absolute values of the double-precision
3257
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3258
| before being returned.  `zSign' is ignored if the result is a NaN.
3259
| The addition is performed according to the IEC/IEEE Standard for Binary
3260
| Floating-Point Arithmetic.
3261
*----------------------------------------------------------------------------*/
3262

    
3263
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3264
{
3265
    int16 aExp, bExp, zExp;
3266
    uint64_t aSig, bSig, zSig;
3267
    int16 expDiff;
3268

    
3269
    aSig = extractFloat64Frac( a );
3270
    aExp = extractFloat64Exp( a );
3271
    bSig = extractFloat64Frac( b );
3272
    bExp = extractFloat64Exp( b );
3273
    expDiff = aExp - bExp;
3274
    aSig <<= 9;
3275
    bSig <<= 9;
3276
    if ( 0 < expDiff ) {
3277
        if ( aExp == 0x7FF ) {
3278
            if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3279
            return a;
3280
        }
3281
        if ( bExp == 0 ) {
3282
            --expDiff;
3283
        }
3284
        else {
3285
            bSig |= LIT64( 0x2000000000000000 );
3286
        }
3287
        shift64RightJamming( bSig, expDiff, &bSig );
3288
        zExp = aExp;
3289
    }
3290
    else if ( expDiff < 0 ) {
3291
        if ( bExp == 0x7FF ) {
3292
            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3293
            return packFloat64( zSign, 0x7FF, 0 );
3294
        }
3295
        if ( aExp == 0 ) {
3296
            ++expDiff;
3297
        }
3298
        else {
3299
            aSig |= LIT64( 0x2000000000000000 );
3300
        }
3301
        shift64RightJamming( aSig, - expDiff, &aSig );
3302
        zExp = bExp;
3303
    }
3304
    else {
3305
        if ( aExp == 0x7FF ) {
3306
            if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3307
            return a;
3308
        }
3309
        if ( aExp == 0 ) {
3310
            if (STATUS(flush_to_zero)) {
3311
                if (aSig | bSig) {
3312
                    float_raise(float_flag_output_denormal STATUS_VAR);
3313
                }
3314
                return packFloat64(zSign, 0, 0);
3315
            }
3316
            return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3317
        }
3318
        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3319
        zExp = aExp;
3320
        goto roundAndPack;
3321
    }
3322
    aSig |= LIT64( 0x2000000000000000 );
3323
    zSig = ( aSig + bSig )<<1;
3324
    --zExp;
3325
    if ( (int64_t) zSig < 0 ) {
3326
        zSig = aSig + bSig;
3327
        ++zExp;
3328
    }
3329
 roundAndPack:
3330
    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3331

    
3332
}
3333

    
3334
/*----------------------------------------------------------------------------
3335
| Returns the result of subtracting the absolute values of the double-
3336
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
3337
| difference is negated before being returned.  `zSign' is ignored if the
3338
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
3339
| Standard for Binary Floating-Point Arithmetic.
3340
*----------------------------------------------------------------------------*/
3341

    
3342
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3343
{
3344
    int16 aExp, bExp, zExp;
3345
    uint64_t aSig, bSig, zSig;
3346
    int16 expDiff;
3347

    
3348
    aSig = extractFloat64Frac( a );
3349
    aExp = extractFloat64Exp( a );
3350
    bSig = extractFloat64Frac( b );
3351
    bExp = extractFloat64Exp( b );
3352
    expDiff = aExp - bExp;
3353
    aSig <<= 10;
3354
    bSig <<= 10;
3355
    if ( 0 < expDiff ) goto aExpBigger;
3356
    if ( expDiff < 0 ) goto bExpBigger;
3357
    if ( aExp == 0x7FF ) {
3358
        if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3359
        float_raise( float_flag_invalid STATUS_VAR);
3360
        return float64_default_nan;
3361
    }
3362
    if ( aExp == 0 ) {
3363
        aExp = 1;
3364
        bExp = 1;
3365
    }
3366
    if ( bSig < aSig ) goto aBigger;
3367
    if ( aSig < bSig ) goto bBigger;
3368
    return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3369
 bExpBigger:
3370
    if ( bExp == 0x7FF ) {
3371
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3372
        return packFloat64( zSign ^ 1, 0x7FF, 0 );
3373
    }
3374
    if ( aExp == 0 ) {
3375
        ++expDiff;
3376
    }
3377
    else {
3378
        aSig |= LIT64( 0x4000000000000000 );
3379
    }
3380
    shift64RightJamming( aSig, - expDiff, &aSig );
3381
    bSig |= LIT64( 0x4000000000000000 );
3382
 bBigger:
3383
    zSig = bSig - aSig;
3384
    zExp = bExp;
3385
    zSign ^= 1;
3386
    goto normalizeRoundAndPack;
3387
 aExpBigger:
3388
    if ( aExp == 0x7FF ) {
3389
        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3390
        return a;
3391
    }
3392
    if ( bExp == 0 ) {
3393
        --expDiff;
3394
    }
3395
    else {
3396
        bSig |= LIT64( 0x4000000000000000 );
3397
    }
3398
    shift64RightJamming( bSig, expDiff, &bSig );
3399
    aSig |= LIT64( 0x4000000000000000 );
3400
 aBigger:
3401
    zSig = aSig - bSig;
3402
    zExp = aExp;
3403
 normalizeRoundAndPack:
3404
    --zExp;
3405
    return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3406

    
3407
}
3408

    
3409
/*----------------------------------------------------------------------------
3410
| Returns the result of adding the double-precision floating-point values `a'
3411
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
3412
| Binary Floating-Point Arithmetic.
3413
*----------------------------------------------------------------------------*/
3414

    
3415
float64 float64_add( float64 a, float64 b STATUS_PARAM )
3416
{
3417
    flag aSign, bSign;
3418
    a = float64_squash_input_denormal(a STATUS_VAR);
3419
    b = float64_squash_input_denormal(b STATUS_VAR);
3420

    
3421
    aSign = extractFloat64Sign( a );
3422
    bSign = extractFloat64Sign( b );
3423
    if ( aSign == bSign ) {
3424
        return addFloat64Sigs( a, b, aSign STATUS_VAR );
3425
    }
3426
    else {
3427
        return subFloat64Sigs( a, b, aSign STATUS_VAR );
3428
    }
3429

    
3430
}
3431

    
3432
/*----------------------------------------------------------------------------
3433
| Returns the result of subtracting the double-precision floating-point values
3434
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3435
| for Binary Floating-Point Arithmetic.
3436
*----------------------------------------------------------------------------*/
3437

    
3438
float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3439
{
3440
    flag aSign, bSign;
3441
    a = float64_squash_input_denormal(a STATUS_VAR);
3442
    b = float64_squash_input_denormal(b STATUS_VAR);
3443

    
3444
    aSign = extractFloat64Sign( a );
3445
    bSign = extractFloat64Sign( b );
3446
    if ( aSign == bSign ) {
3447
        return subFloat64Sigs( a, b, aSign STATUS_VAR );
3448
    }
3449
    else {
3450
        return addFloat64Sigs( a, b, aSign STATUS_VAR );
3451
    }
3452

    
3453
}
3454

    
3455
/*----------------------------------------------------------------------------
3456
| Returns the result of multiplying the double-precision floating-point values
3457
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3458
| for Binary Floating-Point Arithmetic.
3459
*----------------------------------------------------------------------------*/
3460

    
3461
float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3462
{
3463
    flag aSign, bSign, zSign;
3464
    int16 aExp, bExp, zExp;
3465
    uint64_t aSig, bSig, zSig0, zSig1;
3466

    
3467
    a = float64_squash_input_denormal(a STATUS_VAR);
3468
    b = float64_squash_input_denormal(b STATUS_VAR);
3469

    
3470
    aSig = extractFloat64Frac( a );
3471
    aExp = extractFloat64Exp( a );
3472
    aSign = extractFloat64Sign( a );
3473
    bSig = extractFloat64Frac( b );
3474
    bExp = extractFloat64Exp( b );
3475
    bSign = extractFloat64Sign( b );
3476
    zSign = aSign ^ bSign;
3477
    if ( aExp == 0x7FF ) {
3478
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3479
            return propagateFloat64NaN( a, b STATUS_VAR );
3480
        }
3481
        if ( ( bExp | bSig ) == 0 ) {
3482
            float_raise( float_flag_invalid STATUS_VAR);
3483
            return float64_default_nan;
3484
        }
3485
        return packFloat64( zSign, 0x7FF, 0 );
3486
    }
3487
    if ( bExp == 0x7FF ) {
3488
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3489
        if ( ( aExp | aSig ) == 0 ) {
3490
            float_raise( float_flag_invalid STATUS_VAR);
3491
            return float64_default_nan;
3492
        }
3493
        return packFloat64( zSign, 0x7FF, 0 );
3494
    }
3495
    if ( aExp == 0 ) {
3496
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3497
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3498
    }
3499
    if ( bExp == 0 ) {
3500
        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3501
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3502
    }
3503
    zExp = aExp + bExp - 0x3FF;
3504
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3505
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3506
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
3507
    zSig0 |= ( zSig1 != 0 );
3508
    if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3509
        zSig0 <<= 1;
3510
        --zExp;
3511
    }
3512
    return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3513

    
3514
}
3515

    
3516
/*----------------------------------------------------------------------------
3517
| Returns the result of dividing the double-precision floating-point value `a'
3518
| by the corresponding value `b'.  The operation is performed according to
3519
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3520
*----------------------------------------------------------------------------*/
3521

    
3522
float64 float64_div( float64 a, float64 b STATUS_PARAM )
3523
{
3524
    flag aSign, bSign, zSign;
3525
    int16 aExp, bExp, zExp;
3526
    uint64_t aSig, bSig, zSig;
3527
    uint64_t rem0, rem1;
3528
    uint64_t term0, term1;
3529
    a = float64_squash_input_denormal(a STATUS_VAR);
3530
    b = float64_squash_input_denormal(b STATUS_VAR);
3531

    
3532
    aSig = extractFloat64Frac( a );
3533
    aExp = extractFloat64Exp( a );
3534
    aSign = extractFloat64Sign( a );
3535
    bSig = extractFloat64Frac( b );
3536
    bExp = extractFloat64Exp( b );
3537
    bSign = extractFloat64Sign( b );
3538
    zSign = aSign ^ bSign;
3539
    if ( aExp == 0x7FF ) {
3540
        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3541
        if ( bExp == 0x7FF ) {
3542
            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3543
            float_raise( float_flag_invalid STATUS_VAR);
3544
            return float64_default_nan;
3545
        }
3546
        return packFloat64( zSign, 0x7FF, 0 );
3547
    }
3548
    if ( bExp == 0x7FF ) {
3549
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3550
        return packFloat64( zSign, 0, 0 );
3551
    }
3552
    if ( bExp == 0 ) {
3553
        if ( bSig == 0 ) {
3554
            if ( ( aExp | aSig ) == 0 ) {
3555
                float_raise( float_flag_invalid STATUS_VAR);
3556
                return float64_default_nan;
3557
            }
3558
            float_raise( float_flag_divbyzero STATUS_VAR);
3559
            return packFloat64( zSign, 0x7FF, 0 );
3560
        }
3561
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3562
    }
3563
    if ( aExp == 0 ) {
3564
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3565
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3566
    }
3567
    zExp = aExp - bExp + 0x3FD;
3568
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3569
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3570
    if ( bSig <= ( aSig + aSig ) ) {
3571
        aSig >>= 1;
3572
        ++zExp;
3573
    }
3574
    zSig = estimateDiv128To64( aSig, 0, bSig );
3575
    if ( ( zSig & 0x1FF ) <= 2 ) {
3576
        mul64To128( bSig, zSig, &term0, &term1 );
3577
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3578
        while ( (int64_t) rem0 < 0 ) {
3579
            --zSig;
3580
            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3581
        }
3582
        zSig |= ( rem1 != 0 );
3583
    }
3584
    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3585

    
3586
}
3587

    
3588
/*----------------------------------------------------------------------------
3589
| Returns the remainder of the double-precision floating-point value `a'
3590
| with respect to the corresponding value `b'.  The operation is performed
3591
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3592
*----------------------------------------------------------------------------*/
3593

    
3594
float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3595
{
3596
    flag aSign, zSign;
3597
    int16 aExp, bExp, expDiff;
3598
    uint64_t aSig, bSig;
3599
    uint64_t q, alternateASig;
3600
    int64_t sigMean;
3601

    
3602
    a = float64_squash_input_denormal(a STATUS_VAR);
3603
    b = float64_squash_input_denormal(b STATUS_VAR);
3604
    aSig = extractFloat64Frac( a );
3605
    aExp = extractFloat64Exp( a );
3606
    aSign = extractFloat64Sign( a );
3607
    bSig = extractFloat64Frac( b );
3608
    bExp = extractFloat64Exp( b );
3609
    if ( aExp == 0x7FF ) {
3610
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3611
            return propagateFloat64NaN( a, b STATUS_VAR );
3612
        }
3613
        float_raise( float_flag_invalid STATUS_VAR);
3614
        return float64_default_nan;
3615
    }
3616
    if ( bExp == 0x7FF ) {
3617
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3618
        return a;
3619
    }
3620
    if ( bExp == 0 ) {
3621
        if ( bSig == 0 ) {
3622
            float_raise( float_flag_invalid STATUS_VAR);
3623
            return float64_default_nan;
3624
        }
3625
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3626
    }
3627
    if ( aExp == 0 ) {
3628
        if ( aSig == 0 ) return a;
3629
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3630
    }
3631
    expDiff = aExp - bExp;
3632
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3633
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3634
    if ( expDiff < 0 ) {
3635
        if ( expDiff < -1 ) return a;
3636
        aSig >>= 1;
3637
    }
3638
    q = ( bSig <= aSig );
3639
    if ( q ) aSig -= bSig;
3640
    expDiff -= 64;
3641
    while ( 0 < expDiff ) {
3642
        q = estimateDiv128To64( aSig, 0, bSig );
3643
        q = ( 2 < q ) ? q - 2 : 0;
3644
        aSig = - ( ( bSig>>2 ) * q );
3645
        expDiff -= 62;
3646
    }
3647
    expDiff += 64;
3648
    if ( 0 < expDiff ) {
3649
        q = estimateDiv128To64( aSig, 0, bSig );
3650
        q = ( 2 < q ) ? q - 2 : 0;
3651
        q >>= 64 - expDiff;
3652
        bSig >>= 2;
3653
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3654
    }
3655
    else {
3656
        aSig >>= 2;
3657
        bSig >>= 2;
3658
    }
3659
    do {
3660
        alternateASig = aSig;
3661
        ++q;
3662
        aSig -= bSig;
3663
    } while ( 0 <= (int64_t) aSig );
3664
    sigMean = aSig + alternateASig;
3665
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3666
        aSig = alternateASig;
3667
    }
3668
    zSign = ( (int64_t) aSig < 0 );
3669
    if ( zSign ) aSig = - aSig;
3670
    return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3671

    
3672
}
3673

    
3674
/*----------------------------------------------------------------------------
3675
| Returns the result of multiplying the double-precision floating-point values
3676
| `a' and `b' then adding 'c', with no intermediate rounding step after the
3677
| multiplication.  The operation is performed according to the IEC/IEEE
3678
| Standard for Binary Floating-Point Arithmetic 754-2008.
3679
| The flags argument allows the caller to select negation of the
3680
| addend, the intermediate product, or the final result. (The difference
3681
| between this and having the caller do a separate negation is that negating
3682
| externally will flip the sign bit on NaNs.)
3683
*----------------------------------------------------------------------------*/
3684

    
3685
float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
3686
{
3687
    flag aSign, bSign, cSign, zSign;
3688
    int16 aExp, bExp, cExp, pExp, zExp, expDiff;
3689
    uint64_t aSig, bSig, cSig;
3690
    flag pInf, pZero, pSign;
3691
    uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
3692
    int shiftcount;
3693
    flag signflip, infzero;
3694

    
3695
    a = float64_squash_input_denormal(a STATUS_VAR);
3696
    b = float64_squash_input_denormal(b STATUS_VAR);
3697
    c = float64_squash_input_denormal(c STATUS_VAR);
3698
    aSig = extractFloat64Frac(a);
3699
    aExp = extractFloat64Exp(a);
3700
    aSign = extractFloat64Sign(a);
3701
    bSig = extractFloat64Frac(b);
3702
    bExp = extractFloat64Exp(b);
3703
    bSign = extractFloat64Sign(b);
3704
    cSig = extractFloat64Frac(c);
3705
    cExp = extractFloat64Exp(c);
3706
    cSign = extractFloat64Sign(c);
3707

    
3708
    infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
3709
               (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
3710

    
3711
    /* It is implementation-defined whether the cases of (0,inf,qnan)
3712
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
3713
     * they return if they do), so we have to hand this information
3714
     * off to the target-specific pick-a-NaN routine.
3715
     */
3716
    if (((aExp == 0x7ff) && aSig) ||
3717
        ((bExp == 0x7ff) && bSig) ||
3718
        ((cExp == 0x7ff) && cSig)) {
3719
        return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
3720
    }
3721

    
3722
    if (infzero) {
3723
        float_raise(float_flag_invalid STATUS_VAR);
3724
        return float64_default_nan;
3725
    }
3726

    
3727
    if (flags & float_muladd_negate_c) {
3728
        cSign ^= 1;
3729
    }
3730

    
3731
    signflip = (flags & float_muladd_negate_result) ? 1 : 0;
3732

    
3733
    /* Work out the sign and type of the product */
3734
    pSign = aSign ^ bSign;
3735
    if (flags & float_muladd_negate_product) {
3736
        pSign ^= 1;
3737
    }
3738
    pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
3739
    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
3740

    
3741
    if (cExp == 0x7ff) {
3742
        if (pInf && (pSign ^ cSign)) {
3743
            /* addition of opposite-signed infinities => InvalidOperation */
3744
            float_raise(float_flag_invalid STATUS_VAR);
3745
            return float64_default_nan;
3746
        }
3747
        /* Otherwise generate an infinity of the same sign */
3748
        return packFloat64(cSign ^ signflip, 0x7ff, 0);
3749
    }
3750

    
3751
    if (pInf) {
3752
        return packFloat64(pSign ^ signflip, 0x7ff, 0);
3753
    }
3754

    
3755
    if (pZero) {
3756
        if (cExp == 0) {
3757
            if (cSig == 0) {
3758
                /* Adding two exact zeroes */
3759
                if (pSign == cSign) {
3760
                    zSign = pSign;
3761
                } else if (STATUS(float_rounding_mode) == float_round_down) {
3762
                    zSign = 1;
3763
                } else {
3764
                    zSign = 0;
3765
                }
3766
                return packFloat64(zSign ^ signflip, 0, 0);
3767
            }
3768
            /* Exact zero plus a denorm */
3769
            if (STATUS(flush_to_zero)) {
3770
                float_raise(float_flag_output_denormal STATUS_VAR);
3771
                return packFloat64(cSign ^ signflip, 0, 0);
3772
            }
3773
        }
3774
        /* Zero plus something non-zero : just return the something */
3775
        return make_float64(float64_val(c) ^ ((uint64_t)signflip << 63));
3776
    }
3777

    
3778
    if (aExp == 0) {
3779
        normalizeFloat64Subnormal(aSig, &aExp, &aSig);
3780
    }
3781
    if (bExp == 0) {
3782
        normalizeFloat64Subnormal(bSig, &bExp, &bSig);
3783
    }
3784

    
3785
    /* Calculate the actual result a * b + c */
3786

    
3787
    /* Multiply first; this is easy. */
3788
    /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
3789
     * because we want the true exponent, not the "one-less-than"
3790
     * flavour that roundAndPackFloat64() takes.
3791
     */
3792
    pExp = aExp + bExp - 0x3fe;
3793
    aSig = (aSig | LIT64(0x0010000000000000))<<10;
3794
    bSig = (bSig | LIT64(0x0010000000000000))<<11;
3795
    mul64To128(aSig, bSig, &pSig0, &pSig1);
3796
    if ((int64_t)(pSig0 << 1) >= 0) {
3797
        shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
3798
        pExp--;
3799
    }
3800

    
3801
    zSign = pSign ^ signflip;
3802

    
3803
    /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
3804
     * bit in position 126.
3805
     */
3806
    if (cExp == 0) {
3807
        if (!cSig) {
3808
            /* Throw out the special case of c being an exact zero now */
3809
            shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
3810
            return roundAndPackFloat64(zSign, pExp - 1,
3811
                                       pSig1 STATUS_VAR);
3812
        }
3813
        normalizeFloat64Subnormal(cSig, &cExp, &cSig);
3814
    }
3815

    
3816
    /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
3817
     * significand of the addend, with the explicit bit in position 126.
3818
     */
3819
    cSig0 = cSig << (126 - 64 - 52);
3820
    cSig1 = 0;
3821
    cSig0 |= LIT64(0x4000000000000000);
3822
    expDiff = pExp - cExp;
3823

    
3824
    if (pSign == cSign) {
3825
        /* Addition */
3826
        if (expDiff > 0) {
3827
            /* scale c to match p */
3828
            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
3829
            zExp = pExp;
3830
        } else if (expDiff < 0) {
3831
            /* scale p to match c */
3832
            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
3833
            zExp = cExp;
3834
        } else {
3835
            /* no scaling needed */
3836
            zExp = cExp;
3837
        }
3838
        /* Add significands and make sure explicit bit ends up in posn 126 */
3839
        add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3840
        if ((int64_t)zSig0 < 0) {
3841
            shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
3842
        } else {
3843
            zExp--;
3844
        }
3845
        shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
3846
        return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
3847
    } else {
3848
        /* Subtraction */
3849
        if (expDiff > 0) {
3850
            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
3851
            sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3852
            zExp = pExp;
3853
        } else if (expDiff < 0) {
3854
            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
3855
            sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
3856
            zExp = cExp;
3857
            zSign ^= 1;
3858
        } else {
3859
            zExp = pExp;
3860
            if (lt128(cSig0, cSig1, pSig0, pSig1)) {
3861
                sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
3862
            } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
3863
                sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
3864
                zSign ^= 1;
3865
            } else {
3866
                /* Exact zero */
3867
                zSign = signflip;
3868
                if (STATUS(float_rounding_mode) == float_round_down) {
3869
                    zSign ^= 1;
3870
                }
3871
                return packFloat64(zSign, 0, 0);
3872
            }
3873
        }
3874
        --zExp;
3875
        /* Do the equivalent of normalizeRoundAndPackFloat64() but
3876
         * starting with the significand in a pair of uint64_t.
3877
         */
3878
        if (zSig0) {
3879
            shiftcount = countLeadingZeros64(zSig0) - 1;
3880
            shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
3881
            if (zSig1) {
3882
                zSig0 |= 1;
3883
            }
3884
            zExp -= shiftcount;
3885
        } else {
3886
            shiftcount = countLeadingZeros64(zSig1) - 1;
3887
            zSig0 = zSig1 << shiftcount;
3888
            zExp -= (shiftcount + 64);
3889
        }
3890
        return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
3891
    }
3892
}
3893

    
3894
/*----------------------------------------------------------------------------
3895
| Returns the square root of the double-precision floating-point value `a'.
3896
| The operation is performed according to the IEC/IEEE Standard for Binary
3897
| Floating-Point Arithmetic.
3898
*----------------------------------------------------------------------------*/
3899

    
3900
float64 float64_sqrt( float64 a STATUS_PARAM )
3901
{
3902
    flag aSign;
3903
    int16 aExp, zExp;
3904
    uint64_t aSig, zSig, doubleZSig;
3905
    uint64_t rem0, rem1, term0, term1;
3906
    a = float64_squash_input_denormal(a STATUS_VAR);
3907

    
3908
    aSig = extractFloat64Frac( a );
3909
    aExp = extractFloat64Exp( a );
3910
    aSign = extractFloat64Sign( a );
3911
    if ( aExp == 0x7FF ) {
3912
        if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3913
        if ( ! aSign ) return a;
3914
        float_raise( float_flag_invalid STATUS_VAR);
3915
        return float64_default_nan;
3916
    }
3917
    if ( aSign ) {
3918
        if ( ( aExp | aSig ) == 0 ) return a;
3919
        float_raise( float_flag_invalid STATUS_VAR);
3920
        return float64_default_nan;
3921
    }
3922
    if ( aExp == 0 ) {
3923
        if ( aSig == 0 ) return float64_zero;
3924
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3925
    }
3926
    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3927
    aSig |= LIT64( 0x0010000000000000 );
3928
    zSig = estimateSqrt32( aExp, aSig>>21 );
3929
    aSig <<= 9 - ( aExp & 1 );
3930
    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3931
    if ( ( zSig & 0x1FF ) <= 5 ) {
3932
        doubleZSig = zSig<<1;
3933
        mul64To128( zSig, zSig, &term0, &term1 );
3934
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3935
        while ( (int64_t) rem0 < 0 ) {
3936
            --zSig;
3937
            doubleZSig -= 2;
3938
            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3939
        }
3940
        zSig |= ( ( rem0 | rem1 ) != 0 );
3941
    }
3942
    return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3943

    
3944
}
3945

    
3946
/*----------------------------------------------------------------------------
3947
| Returns the binary log of the double-precision floating-point value `a'.
3948
| The operation is performed according to the IEC/IEEE Standard for Binary
3949
| Floating-Point Arithmetic.
3950
*----------------------------------------------------------------------------*/
3951
float64 float64_log2( float64 a STATUS_PARAM )
3952
{
3953
    flag aSign, zSign;
3954
    int16 aExp;
3955
    uint64_t aSig, aSig0, aSig1, zSig, i;
3956
    a = float64_squash_input_denormal(a STATUS_VAR);
3957

    
3958
    aSig = extractFloat64Frac( a );
3959
    aExp = extractFloat64Exp( a );
3960
    aSign = extractFloat64Sign( a );
3961

    
3962
    if ( aExp == 0 ) {
3963
        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3964
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3965
    }
3966
    if ( aSign ) {
3967
        float_raise( float_flag_invalid STATUS_VAR);
3968
        return float64_default_nan;
3969
    }
3970
    if ( aExp == 0x7FF ) {
3971
        if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3972
        return a;
3973
    }
3974

    
3975
    aExp -= 0x3FF;
3976
    aSig |= LIT64( 0x0010000000000000 );
3977
    zSign = aExp < 0;
3978
    zSig = (uint64_t)aExp << 52;
3979
    for (i = 1LL << 51; i > 0; i >>= 1) {
3980
        mul64To128( aSig, aSig, &aSig0, &aSig1 );
3981
        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3982
        if ( aSig & LIT64( 0x0020000000000000 ) ) {
3983
            aSig >>= 1;
3984
            zSig |= i;
3985
        }
3986
    }
3987

    
3988
    if ( zSign )
3989
        zSig = -zSig;
3990
    return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3991
}
3992

    
3993
/*----------------------------------------------------------------------------
3994
| Returns 1 if the double-precision floating-point value `a' is equal to the
3995
| corresponding value `b', and 0 otherwise.  The invalid exception is raised
3996
| if either operand is a NaN.  Otherwise, the comparison is performed
3997
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3998
*----------------------------------------------------------------------------*/
3999

    
4000
int float64_eq( float64 a, float64 b STATUS_PARAM )
4001
{
4002
    uint64_t av, bv;
4003
    a = float64_squash_input_denormal(a STATUS_VAR);
4004
    b = float64_squash_input_denormal(b STATUS_VAR);
4005

    
4006
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4007
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4008
       ) {
4009
        float_raise( float_flag_invalid STATUS_VAR);
4010
        return 0;
4011
    }
4012
    av = float64_val(a);
4013
    bv = float64_val(b);
4014
    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4015

    
4016
}
4017

    
4018
/*----------------------------------------------------------------------------
4019
| Returns 1 if the double-precision floating-point value `a' is less than or
4020
| equal to the corresponding value `b', and 0 otherwise.  The invalid
4021
| exception is raised if either operand is a NaN.  The comparison is performed
4022
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4023
*----------------------------------------------------------------------------*/
4024

    
4025
int float64_le( float64 a, float64 b STATUS_PARAM )
4026
{
4027
    flag aSign, bSign;
4028
    uint64_t av, bv;
4029
    a = float64_squash_input_denormal(a STATUS_VAR);
4030
    b = float64_squash_input_denormal(b STATUS_VAR);
4031

    
4032
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4033
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4034
       ) {
4035
        float_raise( float_flag_invalid STATUS_VAR);
4036
        return 0;
4037
    }
4038
    aSign = extractFloat64Sign( a );
4039
    bSign = extractFloat64Sign( b );
4040
    av = float64_val(a);
4041
    bv = float64_val(b);
4042
    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4043
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
4044

    
4045
}
4046

    
4047
/*----------------------------------------------------------------------------
4048
| Returns 1 if the double-precision floating-point value `a' is less than
4049
| the corresponding value `b', and 0 otherwise.  The invalid exception is
4050
| raised if either operand is a NaN.  The comparison is performed according
4051
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4052
*----------------------------------------------------------------------------*/
4053

    
4054
int float64_lt( float64 a, float64 b STATUS_PARAM )
4055
{
4056
    flag aSign, bSign;
4057
    uint64_t av, bv;
4058

    
4059
    a = float64_squash_input_denormal(a STATUS_VAR);
4060
    b = float64_squash_input_denormal(b STATUS_VAR);
4061
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4062
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4063
       ) {
4064
        float_raise( float_flag_invalid STATUS_VAR);
4065
        return 0;
4066
    }
4067
    aSign = extractFloat64Sign( a );
4068
    bSign = extractFloat64Sign( b );
4069
    av = float64_val(a);
4070
    bv = float64_val(b);
4071
    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4072
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
4073

    
4074
}
4075

    
4076
/*----------------------------------------------------------------------------
4077
| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4078
| be compared, and 0 otherwise.  The invalid exception is raised if either
4079
| operand is a NaN.  The comparison is performed according to the IEC/IEEE
4080
| Standard for Binary Floating-Point Arithmetic.
4081
*----------------------------------------------------------------------------*/
4082

    
4083
int float64_unordered( float64 a, float64 b STATUS_PARAM )
4084
{
4085
    a = float64_squash_input_denormal(a STATUS_VAR);
4086
    b = float64_squash_input_denormal(b STATUS_VAR);
4087

    
4088
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4089
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4090
       ) {
4091
        float_raise( float_flag_invalid STATUS_VAR);
4092
        return 1;
4093
    }
4094
    return 0;
4095
}
4096

    
4097
/*----------------------------------------------------------------------------
4098
| Returns 1 if the double-precision floating-point value `a' is equal to the
4099
| corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4100
| exception.The comparison is performed according to the IEC/IEEE Standard
4101
| for Binary Floating-Point Arithmetic.
4102
*----------------------------------------------------------------------------*/
4103

    
4104
int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
4105
{
4106
    uint64_t av, bv;
4107
    a = float64_squash_input_denormal(a STATUS_VAR);
4108
    b = float64_squash_input_denormal(b STATUS_VAR);
4109

    
4110
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4111
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4112
       ) {
4113
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4114
            float_raise( float_flag_invalid STATUS_VAR);
4115
        }
4116
        return 0;
4117
    }
4118
    av = float64_val(a);
4119
    bv = float64_val(b);
4120
    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4121

    
4122
}
4123

    
4124
/*----------------------------------------------------------------------------
4125
| Returns 1 if the double-precision floating-point value `a' is less than or
4126
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4127
| cause an exception.  Otherwise, the comparison is performed according to the
4128
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4129
*----------------------------------------------------------------------------*/
4130

    
4131
int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
4132
{
4133
    flag aSign, bSign;
4134
    uint64_t av, bv;
4135
    a = float64_squash_input_denormal(a STATUS_VAR);
4136
    b = float64_squash_input_denormal(b STATUS_VAR);
4137

    
4138
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4139
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4140
       ) {
4141
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4142
            float_raise( float_flag_invalid STATUS_VAR);
4143
        }
4144
        return 0;
4145
    }
4146
    aSign = extractFloat64Sign( a );
4147
    bSign = extractFloat64Sign( b );
4148
    av = float64_val(a);
4149
    bv = float64_val(b);
4150
    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4151
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
4152

    
4153
}
4154

    
4155
/*----------------------------------------------------------------------------
4156
| Returns 1 if the double-precision floating-point value `a' is less than
4157
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4158
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4159
| Standard for Binary Floating-Point Arithmetic.
4160
*----------------------------------------------------------------------------*/
4161

    
4162
int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
4163
{
4164
    flag aSign, bSign;
4165
    uint64_t av, bv;
4166
    a = float64_squash_input_denormal(a STATUS_VAR);
4167
    b = float64_squash_input_denormal(b STATUS_VAR);
4168

    
4169
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4170
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4171
       ) {
4172
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4173
            float_raise( float_flag_invalid STATUS_VAR);
4174
        }
4175
        return 0;
4176
    }
4177
    aSign = extractFloat64Sign( a );
4178
    bSign = extractFloat64Sign( b );
4179
    av = float64_val(a);
4180
    bv = float64_val(b);
4181
    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4182
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
4183

    
4184
}
4185

    
4186
/*----------------------------------------------------------------------------
4187
| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4188
| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4189
| comparison is performed according to the IEC/IEEE Standard for Binary
4190
| Floating-Point Arithmetic.
4191
*----------------------------------------------------------------------------*/
4192

    
4193
int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
4194
{
4195
    a = float64_squash_input_denormal(a STATUS_VAR);
4196
    b = float64_squash_input_denormal(b STATUS_VAR);
4197

    
4198
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4199
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4200
       ) {
4201
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4202
            float_raise( float_flag_invalid STATUS_VAR);
4203
        }
4204
        return 1;
4205
    }
4206
    return 0;
4207
}
4208

    
4209
/*----------------------------------------------------------------------------
4210
| Returns the result of converting the extended double-precision floating-
4211
| point value `a' to the 32-bit two's complement integer format.  The
4212
| conversion is performed according to the IEC/IEEE Standard for Binary
4213
| Floating-Point Arithmetic---which means in particular that the conversion
4214
| is rounded according to the current rounding mode.  If `a' is a NaN, the
4215
| largest positive integer is returned.  Otherwise, if the conversion
4216
| overflows, the largest integer with the same sign as `a' is returned.
4217
*----------------------------------------------------------------------------*/
4218

    
4219
int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
4220
{
4221
    flag aSign;
4222
    int32 aExp, shiftCount;
4223
    uint64_t aSig;
4224

    
4225
    aSig = extractFloatx80Frac( a );
4226
    aExp = extractFloatx80Exp( a );
4227
    aSign = extractFloatx80Sign( a );
4228
    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4229
    shiftCount = 0x4037 - aExp;
4230
    if ( shiftCount <= 0 ) shiftCount = 1;
4231
    shift64RightJamming( aSig, shiftCount, &aSig );
4232
    return roundAndPackInt32( aSign, aSig STATUS_VAR );
4233

    
4234
}
4235

    
4236
/*----------------------------------------------------------------------------
4237
| Returns the result of converting the extended double-precision floating-
4238
| point value `a' to the 32-bit two's complement integer format.  The
4239
| conversion is performed according to the IEC/IEEE Standard for Binary
4240
| Floating-Point Arithmetic, except that the conversion is always rounded
4241
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
4242
| Otherwise, if the conversion overflows, the largest integer with the same
4243
| sign as `a' is returned.
4244
*----------------------------------------------------------------------------*/
4245

    
4246
int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
4247
{
4248
    flag aSign;
4249
    int32 aExp, shiftCount;
4250
    uint64_t aSig, savedASig;
4251
    int32_t z;
4252

    
4253
    aSig = extractFloatx80Frac( a );
4254
    aExp = extractFloatx80Exp( a );
4255
    aSign = extractFloatx80Sign( a );
4256
    if ( 0x401E < aExp ) {
4257
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4258
        goto invalid;
4259
    }
4260
    else if ( aExp < 0x3FFF ) {
4261
        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4262
        return 0;
4263
    }
4264
    shiftCount = 0x403E - aExp;
4265
    savedASig = aSig;
4266
    aSig >>= shiftCount;
4267
    z = aSig;
4268
    if ( aSign ) z = - z;
4269
    if ( ( z < 0 ) ^ aSign ) {
4270
 invalid:
4271
        float_raise( float_flag_invalid STATUS_VAR);
4272
        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4273
    }
4274
    if ( ( aSig<<shiftCount ) != savedASig ) {
4275
        STATUS(float_exception_flags) |= float_flag_inexact;
4276
    }
4277
    return z;
4278

    
4279
}
4280

    
4281
/*----------------------------------------------------------------------------
4282
| Returns the result of converting the extended double-precision floating-
4283
| point value `a' to the 64-bit two's complement integer format.  The
4284
| conversion is performed according to the IEC/IEEE Standard for Binary
4285
| Floating-Point Arithmetic---which means in particular that the conversion
4286
| is rounded according to the current rounding mode.  If `a' is a NaN,
4287
| the largest positive integer is returned.  Otherwise, if the conversion
4288
| overflows, the largest integer with the same sign as `a' is returned.
4289
*----------------------------------------------------------------------------*/
4290

    
4291
int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
4292
{
4293
    flag aSign;
4294
    int32 aExp, shiftCount;
4295
    uint64_t aSig, aSigExtra;
4296

    
4297
    aSig = extractFloatx80Frac( a );
4298
    aExp = extractFloatx80Exp( a );
4299
    aSign = extractFloatx80Sign( a );
4300
    shiftCount = 0x403E - aExp;
4301
    if ( shiftCount <= 0 ) {
4302
        if ( shiftCount ) {
4303
            float_raise( float_flag_invalid STATUS_VAR);
4304
            if (    ! aSign
4305
                 || (    ( aExp == 0x7FFF )
4306
                      && ( aSig != LIT64( 0x8000000000000000 ) ) )
4307
               ) {
4308
                return LIT64( 0x7FFFFFFFFFFFFFFF );
4309
            }
4310
            return (int64_t) LIT64( 0x8000000000000000 );
4311
        }
4312
        aSigExtra = 0;
4313
    }
4314
    else {
4315
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4316
    }
4317
    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
4318

    
4319
}
4320

    
4321
/*----------------------------------------------------------------------------
4322
| Returns the result of converting the extended double-precision floating-
4323
| point value `a' to the 64-bit two's complement integer format.  The
4324
| conversion is performed according to the IEC/IEEE Standard for Binary
4325
| Floating-Point Arithmetic, except that the conversion is always rounded
4326
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
4327
| Otherwise, if the conversion overflows, the largest integer with the same
4328
| sign as `a' is returned.
4329
*----------------------------------------------------------------------------*/
4330

    
4331
int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
4332
{
4333
    flag aSign;
4334
    int32 aExp, shiftCount;
4335
    uint64_t aSig;
4336
    int64 z;
4337

    
4338
    aSig = extractFloatx80Frac( a );
4339
    aExp = extractFloatx80Exp( a );
4340
    aSign = extractFloatx80Sign( a );
4341
    shiftCount = aExp - 0x403E;
4342
    if ( 0 <= shiftCount ) {
4343
        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4344
        if ( ( a.high != 0xC03E ) || aSig ) {
4345
            float_raise( float_flag_invalid STATUS_VAR);
4346
            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4347
                return LIT64( 0x7FFFFFFFFFFFFFFF );
4348
            }
4349
        }
4350
        return (int64_t) LIT64( 0x8000000000000000 );
4351
    }
4352
    else if ( aExp < 0x3FFF ) {
4353
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4354
        return 0;
4355
    }
4356
    z = aSig>>( - shiftCount );
4357
    if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4358
        STATUS(float_exception_flags) |= float_flag_inexact;
4359
    }
4360
    if ( aSign ) z = - z;
4361
    return z;
4362

    
4363
}
4364

    
4365
/*----------------------------------------------------------------------------
4366
| Returns the result of converting the extended double-precision floating-
4367
| point value `a' to the single-precision floating-point format.  The
4368
| conversion is performed according to the IEC/IEEE Standard for Binary
4369
| Floating-Point Arithmetic.
4370
*----------------------------------------------------------------------------*/
4371

    
4372
float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
4373
{
4374
    flag aSign;
4375
    int32 aExp;
4376
    uint64_t aSig;
4377

    
4378
    aSig = extractFloatx80Frac( a );
4379
    aExp = extractFloatx80Exp( a );
4380
    aSign = extractFloatx80Sign( a );
4381
    if ( aExp == 0x7FFF ) {
4382
        if ( (uint64_t) ( aSig<<1 ) ) {
4383
            return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4384
        }
4385
        return packFloat32( aSign, 0xFF, 0 );
4386
    }
4387
    shift64RightJamming( aSig, 33, &aSig );
4388
    if ( aExp || aSig ) aExp -= 0x3F81;
4389
    return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
4390

    
4391
}
4392

    
4393
/*----------------------------------------------------------------------------
4394
| Returns the result of converting the extended double-precision floating-
4395
| point value `a' to the double-precision floating-point format.  The
4396
| conversion is performed according to the IEC/IEEE Standard for Binary
4397
| Floating-Point Arithmetic.
4398
*----------------------------------------------------------------------------*/
4399

    
4400
float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
4401
{
4402
    flag aSign;
4403
    int32 aExp;
4404
    uint64_t aSig, zSig;
4405

    
4406
    aSig = extractFloatx80Frac( a );
4407
    aExp = extractFloatx80Exp( a );
4408
    aSign = extractFloatx80Sign( a );
4409
    if ( aExp == 0x7FFF ) {
4410
        if ( (uint64_t) ( aSig<<1 ) ) {
4411
            return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4412
        }
4413
        return packFloat64( aSign, 0x7FF, 0 );
4414
    }
4415
    shift64RightJamming( aSig, 1, &zSig );
4416
    if ( aExp || aSig ) aExp -= 0x3C01;
4417
    return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4418

    
4419
}
4420

    
4421
/*----------------------------------------------------------------------------
4422
| Returns the result of converting the extended double-precision floating-
4423
| point value `a' to the quadruple-precision floating-point format.  The
4424
| conversion is performed according to the IEC/IEEE Standard for Binary
4425
| Floating-Point Arithmetic.
4426
*----------------------------------------------------------------------------*/
4427

    
4428
float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4429
{
4430
    flag aSign;
4431
    int16 aExp;
4432
    uint64_t aSig, zSig0, zSig1;
4433

    
4434
    aSig = extractFloatx80Frac( a );
4435
    aExp = extractFloatx80Exp( a );
4436
    aSign = extractFloatx80Sign( a );
4437
    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4438
        return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4439
    }
4440
    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4441
    return packFloat128( aSign, aExp, zSig0, zSig1 );
4442

    
4443
}
4444

    
4445
/*----------------------------------------------------------------------------
4446
| Rounds the extended double-precision floating-point value `a' to an integer,
4447
| and returns the result as an extended quadruple-precision floating-point
4448
| value.  The operation is performed according to the IEC/IEEE Standard for
4449
| Binary Floating-Point Arithmetic.
4450
*----------------------------------------------------------------------------*/
4451

    
4452
floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4453
{
4454
    flag aSign;
4455
    int32 aExp;
4456
    uint64_t lastBitMask, roundBitsMask;
4457
    int8 roundingMode;
4458
    floatx80 z;
4459

    
4460
    aExp = extractFloatx80Exp( a );
4461
    if ( 0x403E <= aExp ) {
4462
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4463
            return propagateFloatx80NaN( a, a STATUS_VAR );
4464
        }
4465
        return a;
4466
    }
4467
    if ( aExp < 0x3FFF ) {
4468
        if (    ( aExp == 0 )
4469
             && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4470
            return a;
4471
        }
4472
        STATUS(float_exception_flags) |= float_flag_inexact;
4473
        aSign = extractFloatx80Sign( a );
4474
        switch ( STATUS(float_rounding_mode) ) {
4475
         case float_round_nearest_even:
4476
            if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4477
               ) {
4478
                return
4479
                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4480
            }
4481
            break;
4482
         case float_round_down:
4483
            return
4484
                  aSign ?
4485
                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4486
                : packFloatx80( 0, 0, 0 );
4487
         case float_round_up:
4488
            return
4489
                  aSign ? packFloatx80( 1, 0, 0 )
4490
                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4491
        }
4492
        return packFloatx80( aSign, 0, 0 );
4493
    }
4494
    lastBitMask = 1;
4495
    lastBitMask <<= 0x403E - aExp;
4496
    roundBitsMask = lastBitMask - 1;
4497
    z = a;
4498
    roundingMode = STATUS(float_rounding_mode);
4499
    if ( roundingMode == float_round_nearest_even ) {
4500
        z.low += lastBitMask>>1;
4501
        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4502
    }
4503
    else if ( roundingMode != float_round_to_zero ) {
4504
        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4505
            z.low += roundBitsMask;
4506
        }
4507
    }
4508
    z.low &= ~ roundBitsMask;
4509
    if ( z.low == 0 ) {
4510
        ++z.high;
4511
        z.low = LIT64( 0x8000000000000000 );
4512
    }
4513
    if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4514
    return z;
4515

    
4516
}
4517

    
4518
/*----------------------------------------------------------------------------
4519
| Returns the result of adding the absolute values of the extended double-
4520
| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4521
| negated before being returned.  `zSign' is ignored if the result is a NaN.
4522
| The addition is performed according to the IEC/IEEE Standard for Binary
4523
| Floating-Point Arithmetic.
4524
*----------------------------------------------------------------------------*/
4525

    
4526
static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4527
{
4528
    int32 aExp, bExp, zExp;
4529
    uint64_t aSig, bSig, zSig0, zSig1;
4530
    int32 expDiff;
4531

    
4532
    aSig = extractFloatx80Frac( a );
4533
    aExp = extractFloatx80Exp( a );
4534
    bSig = extractFloatx80Frac( b );
4535
    bExp = extractFloatx80Exp( b );
4536
    expDiff = aExp - bExp;
4537
    if ( 0 < expDiff ) {
4538
        if ( aExp == 0x7FFF ) {
4539
            if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4540
            return a;
4541
        }
4542
        if ( bExp == 0 ) --expDiff;
4543
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4544
        zExp = aExp;
4545
    }
4546
    else if ( expDiff < 0 ) {
4547
        if ( bExp == 0x7FFF ) {
4548
            if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4549
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4550
        }
4551
        if ( aExp == 0 ) ++expDiff;
4552
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4553
        zExp = bExp;
4554
    }
4555
    else {
4556
        if ( aExp == 0x7FFF ) {
4557
            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4558
                return propagateFloatx80NaN( a, b STATUS_VAR );
4559
            }
4560
            return a;
4561
        }
4562
        zSig1 = 0;
4563
        zSig0 = aSig + bSig;
4564
        if ( aExp == 0 ) {
4565
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4566
            goto roundAndPack;
4567
        }
4568
        zExp = aExp;
4569
        goto shiftRight1;
4570
    }
4571
    zSig0 = aSig + bSig;
4572
    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4573
 shiftRight1:
4574
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4575
    zSig0 |= LIT64( 0x8000000000000000 );
4576
    ++zExp;
4577
 roundAndPack:
4578
    return
4579
        roundAndPackFloatx80(
4580
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4581

    
4582
}
4583

    
4584
/*----------------------------------------------------------------------------
4585
| Returns the result of subtracting the absolute values of the extended
4586
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4587
| difference is negated before being returned.  `zSign' is ignored if the
4588
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
4589
| Standard for Binary Floating-Point Arithmetic.
4590
*----------------------------------------------------------------------------*/
4591

    
4592
static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4593
{
4594
    int32 aExp, bExp, zExp;
4595
    uint64_t aSig, bSig, zSig0, zSig1;
4596
    int32 expDiff;
4597
    floatx80 z;
4598

    
4599
    aSig = extractFloatx80Frac( a );
4600
    aExp = extractFloatx80Exp( a );
4601
    bSig = extractFloatx80Frac( b );
4602
    bExp = extractFloatx80Exp( b );
4603
    expDiff = aExp - bExp;
4604
    if ( 0 < expDiff ) goto aExpBigger;
4605
    if ( expDiff < 0 ) goto bExpBigger;
4606
    if ( aExp == 0x7FFF ) {
4607
        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4608
            return propagateFloatx80NaN( a, b STATUS_VAR );
4609
        }
4610
        float_raise( float_flag_invalid STATUS_VAR);
4611
        z.low = floatx80_default_nan_low;
4612
        z.high = floatx80_default_nan_high;
4613
        return z;
4614
    }
4615
    if ( aExp == 0 ) {
4616
        aExp = 1;
4617
        bExp = 1;
4618
    }
4619
    zSig1 = 0;
4620
    if ( bSig < aSig ) goto aBigger;
4621
    if ( aSig < bSig ) goto bBigger;
4622
    return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4623
 bExpBigger:
4624
    if ( bExp == 0x7FFF ) {
4625
        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4626
        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4627
    }
4628
    if ( aExp == 0 ) ++expDiff;
4629
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4630
 bBigger:
4631
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4632
    zExp = bExp;
4633
    zSign ^= 1;
4634
    goto normalizeRoundAndPack;
4635
 aExpBigger:
4636
    if ( aExp == 0x7FFF ) {
4637
        if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4638
        return a;
4639
    }
4640
    if ( bExp == 0 ) --expDiff;
4641
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4642
 aBigger:
4643
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4644
    zExp = aExp;
4645
 normalizeRoundAndPack:
4646
    return
4647
        normalizeRoundAndPackFloatx80(
4648
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4649

    
4650
}
4651

    
4652
/*----------------------------------------------------------------------------
4653
| Returns the result of adding the extended double-precision floating-point
4654
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4655
| Standard for Binary Floating-Point Arithmetic.
4656
*----------------------------------------------------------------------------*/
4657

    
4658
floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4659
{
4660
    flag aSign, bSign;
4661

    
4662
    aSign = extractFloatx80Sign( a );
4663
    bSign = extractFloatx80Sign( b );
4664
    if ( aSign == bSign ) {
4665
        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4666
    }
4667
    else {
4668
        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4669
    }
4670

    
4671
}
4672

    
4673
/*----------------------------------------------------------------------------
4674
| Returns the result of subtracting the extended double-precision floating-
4675
| point values `a' and `b'.  The operation is performed according to the
4676
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4677
*----------------------------------------------------------------------------*/
4678

    
4679
floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4680
{
4681
    flag aSign, bSign;
4682

    
4683
    aSign = extractFloatx80Sign( a );
4684
    bSign = extractFloatx80Sign( b );
4685
    if ( aSign == bSign ) {
4686
        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4687
    }
4688
    else {
4689
        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4690
    }
4691

    
4692
}
4693

    
4694
/*----------------------------------------------------------------------------
4695
| Returns the result of multiplying the extended double-precision floating-
4696
| point values `a' and `b'.  The operation is performed according to the
4697
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4698
*----------------------------------------------------------------------------*/
4699

    
4700
floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4701
{
4702
    flag aSign, bSign, zSign;
4703
    int32 aExp, bExp, zExp;
4704
    uint64_t aSig, bSig, zSig0, zSig1;
4705
    floatx80 z;
4706

    
4707
    aSig = extractFloatx80Frac( a );
4708
    aExp = extractFloatx80Exp( a );
4709
    aSign = extractFloatx80Sign( a );
4710
    bSig = extractFloatx80Frac( b );
4711
    bExp = extractFloatx80Exp( b );
4712
    bSign = extractFloatx80Sign( b );
4713
    zSign = aSign ^ bSign;
4714
    if ( aExp == 0x7FFF ) {
4715
        if (    (uint64_t) ( aSig<<1 )
4716
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4717
            return propagateFloatx80NaN( a, b STATUS_VAR );
4718
        }
4719
        if ( ( bExp | bSig ) == 0 ) goto invalid;
4720
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4721
    }
4722
    if ( bExp == 0x7FFF ) {
4723
        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4724
        if ( ( aExp | aSig ) == 0 ) {
4725
 invalid:
4726
            float_raise( float_flag_invalid STATUS_VAR);
4727
            z.low = floatx80_default_nan_low;
4728
            z.high = floatx80_default_nan_high;
4729
            return z;
4730
        }
4731
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4732
    }
4733
    if ( aExp == 0 ) {
4734
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4735
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4736
    }
4737
    if ( bExp == 0 ) {
4738
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4739
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4740
    }
4741
    zExp = aExp + bExp - 0x3FFE;
4742
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
4743
    if ( 0 < (int64_t) zSig0 ) {
4744
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4745
        --zExp;
4746
    }
4747
    return
4748
        roundAndPackFloatx80(
4749
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4750

    
4751
}
4752

    
4753
/*----------------------------------------------------------------------------
4754
| Returns the result of dividing the extended double-precision floating-point
4755
| value `a' by the corresponding value `b'.  The operation is performed
4756
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4757
*----------------------------------------------------------------------------*/
4758

    
4759
floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4760
{
4761
    flag aSign, bSign, zSign;
4762
    int32 aExp, bExp, zExp;
4763
    uint64_t aSig, bSig, zSig0, zSig1;
4764
    uint64_t rem0, rem1, rem2, term0, term1, term2;
4765
    floatx80 z;
4766

    
4767
    aSig = extractFloatx80Frac( a );
4768
    aExp = extractFloatx80Exp( a );
4769
    aSign = extractFloatx80Sign( a );
4770
    bSig = extractFloatx80Frac( b );
4771
    bExp = extractFloatx80Exp( b );
4772
    bSign = extractFloatx80Sign( b );
4773
    zSign = aSign ^ bSign;
4774
    if ( aExp == 0x7FFF ) {
4775
        if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4776
        if ( bExp == 0x7FFF ) {
4777
            if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4778
            goto invalid;
4779
        }
4780
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4781
    }
4782
    if ( bExp == 0x7FFF ) {
4783
        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4784
        return packFloatx80( zSign, 0, 0 );
4785
    }
4786
    if ( bExp == 0 ) {
4787
        if ( bSig == 0 ) {
4788
            if ( ( aExp | aSig ) == 0 ) {
4789
 invalid:
4790
                float_raise( float_flag_invalid STATUS_VAR);
4791
                z.low = floatx80_default_nan_low;
4792
                z.high = floatx80_default_nan_high;
4793
                return z;
4794
            }
4795
            float_raise( float_flag_divbyzero STATUS_VAR);
4796
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4797
        }
4798
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4799
    }
4800
    if ( aExp == 0 ) {
4801
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4802
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4803
    }
4804
    zExp = aExp - bExp + 0x3FFE;
4805
    rem1 = 0;
4806
    if ( bSig <= aSig ) {
4807
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
4808
        ++zExp;
4809
    }
4810
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4811
    mul64To128( bSig, zSig0, &term0, &term1 );
4812
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4813
    while ( (int64_t) rem0 < 0 ) {
4814
        --zSig0;
4815
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4816
    }
4817
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
4818
    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4819
        mul64To128( bSig, zSig1, &term1, &term2 );
4820
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4821
        while ( (int64_t) rem1 < 0 ) {
4822
            --zSig1;
4823
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4824
        }
4825
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
4826
    }
4827
    return
4828
        roundAndPackFloatx80(
4829
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4830

    
4831
}
4832

    
4833
/*----------------------------------------------------------------------------
4834
| Returns the remainder of the extended double-precision floating-point value
4835
| `a' with respect to the corresponding value `b'.  The operation is performed
4836
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4837
*----------------------------------------------------------------------------*/
4838

    
4839
floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4840
{
4841
    flag aSign, zSign;
4842
    int32 aExp, bExp, expDiff;
4843
    uint64_t aSig0, aSig1, bSig;
4844
    uint64_t q, term0, term1, alternateASig0, alternateASig1;
4845
    floatx80 z;
4846

    
4847
    aSig0 = extractFloatx80Frac( a );
4848
    aExp = extractFloatx80Exp( a );
4849
    aSign = extractFloatx80Sign( a );
4850
    bSig = extractFloatx80Frac( b );
4851
    bExp = extractFloatx80Exp( b );
4852
    if ( aExp == 0x7FFF ) {
4853
        if (    (uint64_t) ( aSig0<<1 )
4854
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4855
            return propagateFloatx80NaN( a, b STATUS_VAR );
4856
        }
4857
        goto invalid;
4858
    }
4859
    if ( bExp == 0x7FFF ) {
4860
        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4861
        return a;
4862
    }
4863
    if ( bExp == 0 ) {
4864
        if ( bSig == 0 ) {
4865
 invalid:
4866
            float_raise( float_flag_invalid STATUS_VAR);
4867
            z.low = floatx80_default_nan_low;
4868
            z.high = floatx80_default_nan_high;
4869
            return z;
4870
        }
4871
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4872
    }
4873
    if ( aExp == 0 ) {
4874
        if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
4875
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4876
    }
4877
    bSig |= LIT64( 0x8000000000000000 );
4878
    zSign = aSign;
4879
    expDiff = aExp - bExp;
4880
    aSig1 = 0;
4881
    if ( expDiff < 0 ) {
4882
        if ( expDiff < -1 ) return a;
4883
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4884
        expDiff = 0;
4885
    }
4886
    q = ( bSig <= aSig0 );
4887
    if ( q ) aSig0 -= bSig;
4888
    expDiff -= 64;
4889
    while ( 0 < expDiff ) {
4890
        q = estimateDiv128To64( aSig0, aSig1, bSig );
4891
        q = ( 2 < q ) ? q - 2 : 0;
4892
        mul64To128( bSig, q, &term0, &term1 );
4893
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4894
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4895
        expDiff -= 62;
4896
    }
4897
    expDiff += 64;
4898
    if ( 0 < expDiff ) {
4899
        q = estimateDiv128To64( aSig0, aSig1, bSig );
4900
        q = ( 2 < q ) ? q - 2 : 0;
4901
        q >>= 64 - expDiff;
4902
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4903
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4904
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4905
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
4906
            ++q;
4907
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4908
        }
4909
    }
4910
    else {
4911
        term1 = 0;
4912
        term0 = bSig;
4913
    }
4914
    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4915
    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4916
         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4917
              && ( q & 1 ) )
4918
       ) {
4919
        aSig0 = alternateASig0;
4920
        aSig1 = alternateASig1;
4921
        zSign = ! zSign;
4922
    }
4923
    return
4924
        normalizeRoundAndPackFloatx80(
4925
            80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4926

    
4927
}
4928

    
4929
/*----------------------------------------------------------------------------
4930
| Returns the square root of the extended double-precision floating-point
4931
| value `a'.  The operation is performed according to the IEC/IEEE Standard
4932
| for Binary Floating-Point Arithmetic.
4933
*----------------------------------------------------------------------------*/
4934

    
4935
floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4936
{
4937
    flag aSign;
4938
    int32 aExp, zExp;
4939
    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4940
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4941
    floatx80 z;
4942

    
4943
    aSig0 = extractFloatx80Frac( a );
4944
    aExp = extractFloatx80Exp( a );
4945
    aSign = extractFloatx80Sign( a );
4946
    if ( aExp == 0x7FFF ) {
4947
        if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4948
        if ( ! aSign ) return a;
4949
        goto invalid;
4950
    }
4951
    if ( aSign ) {
4952
        if ( ( aExp | aSig0 ) == 0 ) return a;
4953
 invalid:
4954
        float_raise( float_flag_invalid STATUS_VAR);
4955
        z.low = floatx80_default_nan_low;
4956
        z.high = floatx80_default_nan_high;
4957
        return z;
4958
    }
4959
    if ( aExp == 0 ) {
4960
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4961
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4962
    }
4963
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4964
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4965
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4966
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4967
    doubleZSig0 = zSig0<<1;
4968
    mul64To128( zSig0, zSig0, &term0, &term1 );
4969
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4970
    while ( (int64_t) rem0 < 0 ) {
4971
        --zSig0;
4972
        doubleZSig0 -= 2;
4973
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4974
    }
4975
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4976
    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4977
        if ( zSig1 == 0 ) zSig1 = 1;
4978
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4979
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4980
        mul64To128( zSig1, zSig1, &term2, &term3 );
4981
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4982
        while ( (int64_t) rem1 < 0 ) {
4983
            --zSig1;
4984
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4985
            term3 |= 1;
4986
            term2 |= doubleZSig0;
4987
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4988
        }
4989
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4990
    }
4991
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4992
    zSig0 |= doubleZSig0;
4993
    return
4994
        roundAndPackFloatx80(
4995
            STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4996

    
4997
}
4998

    
4999
/*----------------------------------------------------------------------------
5000
| Returns 1 if the extended double-precision floating-point value `a' is equal
5001
| to the corresponding value `b', and 0 otherwise.  The invalid exception is
5002
| raised if either operand is a NaN.  Otherwise, the comparison is performed
5003
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5004
*----------------------------------------------------------------------------*/
5005

    
5006
int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
5007
{
5008

    
5009
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5010
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5011
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5012
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5013
       ) {
5014
        float_raise( float_flag_invalid STATUS_VAR);
5015
        return 0;
5016
    }
5017
    return
5018
           ( a.low == b.low )
5019
        && (    ( a.high == b.high )
5020
             || (    ( a.low == 0 )
5021
                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5022
           );
5023

    
5024
}
5025

    
5026
/*----------------------------------------------------------------------------
5027
| Returns 1 if the extended double-precision floating-point value `a' is
5028
| less than or equal to the corresponding value `b', and 0 otherwise.  The
5029
| invalid exception is raised if either operand is a NaN.  The comparison is
5030
| performed according to the IEC/IEEE Standard for Binary Floating-Point
5031
| Arithmetic.
5032
*----------------------------------------------------------------------------*/
5033

    
5034
int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
5035
{
5036
    flag aSign, bSign;
5037

    
5038
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5039
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5040
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5041
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5042
       ) {
5043
        float_raise( float_flag_invalid STATUS_VAR);
5044
        return 0;
5045
    }
5046
    aSign = extractFloatx80Sign( a );
5047
    bSign = extractFloatx80Sign( b );
5048
    if ( aSign != bSign ) {
5049
        return
5050
               aSign
5051
            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5052
                 == 0 );
5053
    }
5054
    return
5055
          aSign ? le128( b.high, b.low, a.high, a.low )
5056
        : le128( a.high, a.low, b.high, b.low );
5057

    
5058
}
5059

    
5060
/*----------------------------------------------------------------------------
5061
| Returns 1 if the extended double-precision floating-point value `a' is
5062
| less than the corresponding value `b', and 0 otherwise.  The invalid
5063
| exception is raised if either operand is a NaN.  The comparison is performed
5064
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5065
*----------------------------------------------------------------------------*/
5066

    
5067
int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
5068
{
5069
    flag aSign, bSign;
5070

    
5071
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5072
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5073
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5074
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5075
       ) {
5076
        float_raise( float_flag_invalid STATUS_VAR);
5077
        return 0;
5078
    }
5079
    aSign = extractFloatx80Sign( a );
5080
    bSign = extractFloatx80Sign( b );
5081
    if ( aSign != bSign ) {
5082
        return
5083
               aSign
5084
            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5085
                 != 0 );
5086
    }
5087
    return
5088
          aSign ? lt128( b.high, b.low, a.high, a.low )
5089
        : lt128( a.high, a.low, b.high, b.low );
5090

    
5091
}
5092

    
5093
/*----------------------------------------------------------------------------
5094
| Returns 1 if the extended double-precision floating-point values `a' and `b'
5095
| cannot be compared, and 0 otherwise.  The invalid exception is raised if
5096
| either operand is a NaN.   The comparison is performed according to the
5097
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5098
*----------------------------------------------------------------------------*/
5099
int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
5100
{
5101
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5102
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5103
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5104
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5105
       ) {
5106
        float_raise( float_flag_invalid STATUS_VAR);
5107
        return 1;
5108
    }
5109
    return 0;
5110
}
5111

    
5112
/*----------------------------------------------------------------------------
5113
| Returns 1 if the extended double-precision floating-point value `a' is
5114
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5115
| cause an exception.  The comparison is performed according to the IEC/IEEE
5116
| Standard for Binary Floating-Point Arithmetic.
5117
*----------------------------------------------------------------------------*/
5118

    
5119
int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5120
{
5121

    
5122
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5123
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5124
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5125
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5126
       ) {
5127
        if (    floatx80_is_signaling_nan( a )
5128
             || floatx80_is_signaling_nan( b ) ) {
5129
            float_raise( float_flag_invalid STATUS_VAR);
5130
        }
5131
        return 0;
5132
    }
5133
    return
5134
           ( a.low == b.low )
5135
        && (    ( a.high == b.high )
5136
             || (    ( a.low == 0 )
5137
                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5138
           );
5139

    
5140
}
5141

    
5142
/*----------------------------------------------------------------------------
5143
| Returns 1 if the extended double-precision floating-point value `a' is less
5144
| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5145
| do not cause an exception.  Otherwise, the comparison is performed according
5146
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5147
*----------------------------------------------------------------------------*/
5148

    
5149
int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5150
{
5151
    flag aSign, bSign;
5152

    
5153
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5154
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5155
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5156
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5157
       ) {
5158
        if (    floatx80_is_signaling_nan( a )
5159
             || floatx80_is_signaling_nan( b ) ) {
5160
            float_raise( float_flag_invalid STATUS_VAR);
5161
        }
5162
        return 0;
5163
    }
5164
    aSign = extractFloatx80Sign( a );
5165
    bSign = extractFloatx80Sign( b );
5166
    if ( aSign != bSign ) {
5167
        return
5168
               aSign
5169
            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5170
                 == 0 );
5171
    }
5172
    return
5173
          aSign ? le128( b.high, b.low, a.high, a.low )
5174
        : le128( a.high, a.low, b.high, b.low );
5175

    
5176
}
5177

    
5178
/*----------------------------------------------------------------------------
5179
| Returns 1 if the extended double-precision floating-point value `a' is less
5180
| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5181
| an exception.  Otherwise, the comparison is performed according to the
5182
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5183
*----------------------------------------------------------------------------*/
5184

    
5185
int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5186
{
5187
    flag aSign, bSign;
5188

    
5189
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5190
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5191
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5192
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5193
       ) {
5194
        if (    floatx80_is_signaling_nan( a )
5195
             || floatx80_is_signaling_nan( b ) ) {
5196
            float_raise( float_flag_invalid STATUS_VAR);
5197
        }
5198
        return 0;
5199
    }
5200
    aSign = extractFloatx80Sign( a );
5201
    bSign = extractFloatx80Sign( b );
5202
    if ( aSign != bSign ) {
5203
        return
5204
               aSign
5205
            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5206
                 != 0 );
5207
    }
5208
    return
5209
          aSign ? lt128( b.high, b.low, a.high, a.low )
5210
        : lt128( a.high, a.low, b.high, b.low );
5211

    
5212
}
5213

    
5214
/*----------------------------------------------------------------------------
5215
| Returns 1 if the extended double-precision floating-point values `a' and `b'
5216
| cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5217
| The comparison is performed according to the IEC/IEEE Standard for Binary
5218
| Floating-Point Arithmetic.
5219
*----------------------------------------------------------------------------*/
5220
int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5221
{
5222
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5223
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5224
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5225
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5226
       ) {
5227
        if (    floatx80_is_signaling_nan( a )
5228
             || floatx80_is_signaling_nan( b ) ) {
5229
            float_raise( float_flag_invalid STATUS_VAR);
5230
        }
5231
        return 1;
5232
    }
5233
    return 0;
5234
}
5235

    
5236
/*----------------------------------------------------------------------------
5237
| Returns the result of converting the quadruple-precision floating-point
5238
| value `a' to the 32-bit two's complement integer format.  The conversion
5239
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5240
| Arithmetic---which means in particular that the conversion is rounded
5241
| according to the current rounding mode.  If `a' is a NaN, the largest
5242
| positive integer is returned.  Otherwise, if the conversion overflows, the
5243
| largest integer with the same sign as `a' is returned.
5244
*----------------------------------------------------------------------------*/
5245

    
5246
int32 float128_to_int32( float128 a STATUS_PARAM )
5247
{
5248
    flag aSign;
5249
    int32 aExp, shiftCount;
5250
    uint64_t aSig0, aSig1;
5251

    
5252
    aSig1 = extractFloat128Frac1( a );
5253
    aSig0 = extractFloat128Frac0( a );
5254
    aExp = extractFloat128Exp( a );
5255
    aSign = extractFloat128Sign( a );
5256
    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5257
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5258
    aSig0 |= ( aSig1 != 0 );
5259
    shiftCount = 0x4028 - aExp;
5260
    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5261
    return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
5262

    
5263
}
5264

    
5265
/*----------------------------------------------------------------------------
5266
| Returns the result of converting the quadruple-precision floating-point
5267
| value `a' to the 32-bit two's complement integer format.  The conversion
5268
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5269
| Arithmetic, except that the conversion is always rounded toward zero.  If
5270
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5271
| conversion overflows, the largest integer with the same sign as `a' is
5272
| returned.
5273
*----------------------------------------------------------------------------*/
5274

    
5275
int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
5276
{
5277
    flag aSign;
5278
    int32 aExp, shiftCount;
5279
    uint64_t aSig0, aSig1, savedASig;
5280
    int32_t z;
5281

    
5282
    aSig1 = extractFloat128Frac1( a );
5283
    aSig0 = extractFloat128Frac0( a );
5284
    aExp = extractFloat128Exp( a );
5285
    aSign = extractFloat128Sign( a );
5286
    aSig0 |= ( aSig1 != 0 );
5287
    if ( 0x401E < aExp ) {
5288
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5289
        goto invalid;
5290
    }
5291
    else if ( aExp < 0x3FFF ) {
5292
        if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
5293
        return 0;
5294
    }
5295
    aSig0 |= LIT64( 0x0001000000000000 );
5296
    shiftCount = 0x402F - aExp;
5297
    savedASig = aSig0;
5298
    aSig0 >>= shiftCount;
5299
    z = aSig0;
5300
    if ( aSign ) z = - z;
5301
    if ( ( z < 0 ) ^ aSign ) {
5302
 invalid:
5303
        float_raise( float_flag_invalid STATUS_VAR);
5304
        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5305
    }
5306
    if ( ( aSig0<<shiftCount ) != savedASig ) {
5307
        STATUS(float_exception_flags) |= float_flag_inexact;
5308
    }
5309
    return z;
5310

    
5311
}
5312

    
5313
/*----------------------------------------------------------------------------
5314
| Returns the result of converting the quadruple-precision floating-point
5315
| value `a' to the 64-bit two's complement integer format.  The conversion
5316
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5317
| Arithmetic---which means in particular that the conversion is rounded
5318
| according to the current rounding mode.  If `a' is a NaN, the largest
5319
| positive integer is returned.  Otherwise, if the conversion overflows, the
5320
| largest integer with the same sign as `a' is returned.
5321
*----------------------------------------------------------------------------*/
5322

    
5323
int64 float128_to_int64( float128 a STATUS_PARAM )
5324
{
5325
    flag aSign;
5326
    int32 aExp, shiftCount;
5327
    uint64_t aSig0, aSig1;
5328

    
5329
    aSig1 = extractFloat128Frac1( a );
5330
    aSig0 = extractFloat128Frac0( a );
5331
    aExp = extractFloat128Exp( a );
5332
    aSign = extractFloat128Sign( a );
5333
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5334
    shiftCount = 0x402F - aExp;
5335
    if ( shiftCount <= 0 ) {
5336
        if ( 0x403E < aExp ) {
5337
            float_raise( float_flag_invalid STATUS_VAR);
5338
            if (    ! aSign
5339
                 || (    ( aExp == 0x7FFF )
5340
                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5341
                    )
5342
               ) {
5343
                return LIT64( 0x7FFFFFFFFFFFFFFF );
5344
            }
5345
            return (int64_t) LIT64( 0x8000000000000000 );
5346
        }
5347
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5348
    }
5349
    else {
5350
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5351
    }
5352
    return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
5353

    
5354
}
5355

    
5356
/*----------------------------------------------------------------------------
5357
| Returns the result of converting the quadruple-precision floating-point
5358
| value `a' to the 64-bit two's complement integer format.  The conversion
5359
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5360
| Arithmetic, except that the conversion is always rounded toward zero.
5361
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5362
| the conversion overflows, the largest integer with the same sign as `a' is
5363
| returned.
5364
*----------------------------------------------------------------------------*/
5365

    
5366
int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
5367
{
5368
    flag aSign;
5369
    int32 aExp, shiftCount;
5370
    uint64_t aSig0, aSig1;
5371
    int64 z;
5372

    
5373
    aSig1 = extractFloat128Frac1( a );
5374
    aSig0 = extractFloat128Frac0( a );
5375
    aExp = extractFloat128Exp( a );
5376
    aSign = extractFloat128Sign( a );
5377
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5378
    shiftCount = aExp - 0x402F;
5379
    if ( 0 < shiftCount ) {
5380
        if ( 0x403E <= aExp ) {
5381
            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5382
            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5383
                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5384
                if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
5385
            }
5386
            else {
5387
                float_raise( float_flag_invalid STATUS_VAR);
5388
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5389
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
5390
                }
5391
            }
5392
            return (int64_t) LIT64( 0x8000000000000000 );
5393
        }
5394
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5395
        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5396
            STATUS(float_exception_flags) |= float_flag_inexact;
5397
        }
5398
    }
5399
    else {
5400
        if ( aExp < 0x3FFF ) {
5401
            if ( aExp | aSig0 | aSig1 ) {
5402
                STATUS(float_exception_flags) |= float_flag_inexact;
5403
            }
5404
            return 0;
5405
        }
5406
        z = aSig0>>( - shiftCount );
5407
        if (    aSig1
5408
             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5409
            STATUS(float_exception_flags) |= float_flag_inexact;
5410
        }
5411
    }
5412
    if ( aSign ) z = - z;
5413
    return z;
5414

    
5415
}
5416

    
5417
/*----------------------------------------------------------------------------
5418
| Returns the result of converting the quadruple-precision floating-point
5419
| value `a' to the single-precision floating-point format.  The conversion
5420
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5421
| Arithmetic.
5422
*----------------------------------------------------------------------------*/
5423

    
5424
float32 float128_to_float32( float128 a STATUS_PARAM )
5425
{
5426
    flag aSign;
5427
    int32 aExp;
5428
    uint64_t aSig0, aSig1;
5429
    uint32_t zSig;
5430

    
5431
    aSig1 = extractFloat128Frac1( a );
5432
    aSig0 = extractFloat128Frac0( a );
5433
    aExp = extractFloat128Exp( a );
5434
    aSign = extractFloat128Sign( a );
5435
    if ( aExp == 0x7FFF ) {
5436
        if ( aSig0 | aSig1 ) {
5437
            return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5438
        }
5439
        return packFloat32( aSign, 0xFF, 0 );
5440
    }
5441
    aSig0 |= ( aSig1 != 0 );
5442
    shift64RightJamming( aSig0, 18, &aSig0 );
5443
    zSig = aSig0;
5444
    if ( aExp || zSig ) {
5445
        zSig |= 0x40000000;
5446
        aExp -= 0x3F81;
5447
    }
5448
    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5449

    
5450
}
5451

    
5452
/*----------------------------------------------------------------------------
5453
| Returns the result of converting the quadruple-precision floating-point
5454
| value `a' to the double-precision floating-point format.  The conversion
5455
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5456
| Arithmetic.
5457
*----------------------------------------------------------------------------*/
5458

    
5459
float64 float128_to_float64( float128 a STATUS_PARAM )
5460
{
5461
    flag aSign;
5462
    int32 aExp;
5463
    uint64_t aSig0, aSig1;
5464

    
5465
    aSig1 = extractFloat128Frac1( a );
5466
    aSig0 = extractFloat128Frac0( a );
5467
    aExp = extractFloat128Exp( a );
5468
    aSign = extractFloat128Sign( a );
5469
    if ( aExp == 0x7FFF ) {
5470
        if ( aSig0 | aSig1 ) {
5471
            return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5472
        }
5473
        return packFloat64( aSign, 0x7FF, 0 );
5474
    }
5475
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5476
    aSig0 |= ( aSig1 != 0 );
5477
    if ( aExp || aSig0 ) {
5478
        aSig0 |= LIT64( 0x4000000000000000 );
5479
        aExp -= 0x3C01;
5480
    }
5481
    return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5482

    
5483
}
5484

    
5485
/*----------------------------------------------------------------------------
5486
| Returns the result of converting the quadruple-precision floating-point
5487
| value `a' to the extended double-precision floating-point format.  The
5488
| conversion is performed according to the IEC/IEEE Standard for Binary
5489
| Floating-Point Arithmetic.
5490
*----------------------------------------------------------------------------*/
5491

    
5492
floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5493
{
5494
    flag aSign;
5495
    int32 aExp;
5496
    uint64_t aSig0, aSig1;
5497

    
5498
    aSig1 = extractFloat128Frac1( a );
5499
    aSig0 = extractFloat128Frac0( a );
5500
    aExp = extractFloat128Exp( a );
5501
    aSign = extractFloat128Sign( a );
5502
    if ( aExp == 0x7FFF ) {
5503
        if ( aSig0 | aSig1 ) {
5504
            return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5505
        }
5506
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5507
    }
5508
    if ( aExp == 0 ) {
5509
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5510
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5511
    }
5512
    else {
5513
        aSig0 |= LIT64( 0x0001000000000000 );
5514
    }
5515
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5516
    return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5517

    
5518
}
5519

    
5520
/*----------------------------------------------------------------------------
5521
| Rounds the quadruple-precision floating-point value `a' to an integer, and
5522
| returns the result as a quadruple-precision floating-point value.  The
5523
| operation is performed according to the IEC/IEEE Standard for Binary
5524
| Floating-Point Arithmetic.
5525
*----------------------------------------------------------------------------*/
5526

    
5527
float128 float128_round_to_int( float128 a STATUS_PARAM )
5528
{
5529
    flag aSign;
5530
    int32 aExp;
5531
    uint64_t lastBitMask, roundBitsMask;
5532
    int8 roundingMode;
5533
    float128 z;
5534

    
5535
    aExp = extractFloat128Exp( a );
5536
    if ( 0x402F <= aExp ) {
5537
        if ( 0x406F <= aExp ) {
5538
            if (    ( aExp == 0x7FFF )
5539
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5540
               ) {
5541
                return propagateFloat128NaN( a, a STATUS_VAR );
5542
            }
5543
            return a;
5544
        }
5545
        lastBitMask = 1;
5546
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5547
        roundBitsMask = lastBitMask - 1;
5548
        z = a;
5549
        roundingMode = STATUS(float_rounding_mode);
5550
        if ( roundingMode == float_round_nearest_even ) {
5551
            if ( lastBitMask ) {
5552
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5553
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5554
            }
5555
            else {
5556
                if ( (int64_t) z.low < 0 ) {
5557
                    ++z.high;
5558
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5559
                }
5560
            }
5561
        }
5562
        else if ( roundingMode != float_round_to_zero ) {
5563
            if (   extractFloat128Sign( z )
5564
                 ^ ( roundingMode == float_round_up ) ) {
5565
                add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5566
            }
5567
        }
5568
        z.low &= ~ roundBitsMask;
5569
    }
5570
    else {
5571
        if ( aExp < 0x3FFF ) {
5572
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5573
            STATUS(float_exception_flags) |= float_flag_inexact;
5574
            aSign = extractFloat128Sign( a );
5575
            switch ( STATUS(float_rounding_mode) ) {
5576
             case float_round_nearest_even:
5577
                if (    ( aExp == 0x3FFE )
5578
                     && (   extractFloat128Frac0( a )
5579
                          | extractFloat128Frac1( a ) )
5580
                   ) {
5581
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
5582
                }
5583
                break;
5584
             case float_round_down:
5585
                return
5586
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5587
                    : packFloat128( 0, 0, 0, 0 );
5588
             case float_round_up:
5589
                return
5590
                      aSign ? packFloat128( 1, 0, 0, 0 )
5591
                    : packFloat128( 0, 0x3FFF, 0, 0 );
5592
            }
5593
            return packFloat128( aSign, 0, 0, 0 );
5594
        }
5595
        lastBitMask = 1;
5596
        lastBitMask <<= 0x402F - aExp;
5597
        roundBitsMask = lastBitMask - 1;
5598
        z.low = 0;
5599
        z.high = a.high;
5600
        roundingMode = STATUS(float_rounding_mode);
5601
        if ( roundingMode == float_round_nearest_even ) {
5602
            z.high += lastBitMask>>1;
5603
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5604
                z.high &= ~ lastBitMask;
5605
            }
5606
        }
5607
        else if ( roundingMode != float_round_to_zero ) {
5608
            if (   extractFloat128Sign( z )
5609
                 ^ ( roundingMode == float_round_up ) ) {
5610
                z.high |= ( a.low != 0 );
5611
                z.high += roundBitsMask;
5612
            }
5613
        }
5614
        z.high &= ~ roundBitsMask;
5615
    }
5616
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5617
        STATUS(float_exception_flags) |= float_flag_inexact;
5618
    }
5619
    return z;
5620

    
5621
}
5622

    
5623
/*----------------------------------------------------------------------------
5624
| Returns the result of adding the absolute values of the quadruple-precision
5625
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
5626
| before being returned.  `zSign' is ignored if the result is a NaN.
5627
| The addition is performed according to the IEC/IEEE Standard for Binary
5628
| Floating-Point Arithmetic.
5629
*----------------------------------------------------------------------------*/
5630

    
5631
static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5632
{
5633
    int32 aExp, bExp, zExp;
5634
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5635
    int32 expDiff;
5636

    
5637
    aSig1 = extractFloat128Frac1( a );
5638
    aSig0 = extractFloat128Frac0( a );
5639
    aExp = extractFloat128Exp( a );
5640
    bSig1 = extractFloat128Frac1( b );
5641
    bSig0 = extractFloat128Frac0( b );
5642
    bExp = extractFloat128Exp( b );
5643
    expDiff = aExp - bExp;
5644
    if ( 0 < expDiff ) {
5645
        if ( aExp == 0x7FFF ) {
5646
            if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5647
            return a;
5648
        }
5649
        if ( bExp == 0 ) {
5650
            --expDiff;
5651
        }
5652
        else {
5653
            bSig0 |= LIT64( 0x0001000000000000 );
5654
        }
5655
        shift128ExtraRightJamming(
5656
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5657
        zExp = aExp;
5658
    }
5659
    else if ( expDiff < 0 ) {
5660
        if ( bExp == 0x7FFF ) {
5661
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5662
            return packFloat128( zSign, 0x7FFF, 0, 0 );
5663
        }
5664
        if ( aExp == 0 ) {
5665
            ++expDiff;
5666
        }
5667
        else {
5668
            aSig0 |= LIT64( 0x0001000000000000 );
5669
        }
5670
        shift128ExtraRightJamming(
5671
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5672
        zExp = bExp;
5673
    }
5674
    else {
5675
        if ( aExp == 0x7FFF ) {
5676
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5677
                return propagateFloat128NaN( a, b STATUS_VAR );
5678
            }
5679
            return a;
5680
        }
5681
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5682
        if ( aExp == 0 ) {
5683
            if (STATUS(flush_to_zero)) {
5684
                if (zSig0 | zSig1) {
5685
                    float_raise(float_flag_output_denormal STATUS_VAR);
5686
                }
5687
                return packFloat128(zSign, 0, 0, 0);
5688
            }
5689
            return packFloat128( zSign, 0, zSig0, zSig1 );
5690
        }
5691
        zSig2 = 0;
5692
        zSig0 |= LIT64( 0x0002000000000000 );
5693
        zExp = aExp;
5694
        goto shiftRight1;
5695
    }
5696
    aSig0 |= LIT64( 0x0001000000000000 );
5697
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5698
    --zExp;
5699
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5700
    ++zExp;
5701
 shiftRight1:
5702
    shift128ExtraRightJamming(
5703
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5704
 roundAndPack:
5705
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5706

    
5707
}
5708

    
5709
/*----------------------------------------------------------------------------
5710
| Returns the result of subtracting the absolute values of the quadruple-
5711
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
5712
| difference is negated before being returned.  `zSign' is ignored if the
5713
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
5714
| Standard for Binary Floating-Point Arithmetic.
5715
*----------------------------------------------------------------------------*/
5716

    
5717
static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5718
{
5719
    int32 aExp, bExp, zExp;
5720
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5721
    int32 expDiff;
5722
    float128 z;
5723

    
5724
    aSig1 = extractFloat128Frac1( a );
5725
    aSig0 = extractFloat128Frac0( a );
5726
    aExp = extractFloat128Exp( a );
5727
    bSig1 = extractFloat128Frac1( b );
5728
    bSig0 = extractFloat128Frac0( b );
5729
    bExp = extractFloat128Exp( b );
5730
    expDiff = aExp - bExp;
5731
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5732
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5733
    if ( 0 < expDiff ) goto aExpBigger;
5734
    if ( expDiff < 0 ) goto bExpBigger;
5735
    if ( aExp == 0x7FFF ) {
5736
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5737
            return propagateFloat128NaN( a, b STATUS_VAR );
5738
        }
5739
        float_raise( float_flag_invalid STATUS_VAR);
5740
        z.low = float128_default_nan_low;
5741
        z.high = float128_default_nan_high;
5742
        return z;
5743
    }
5744
    if ( aExp == 0 ) {
5745
        aExp = 1;
5746
        bExp = 1;
5747
    }
5748
    if ( bSig0 < aSig0 ) goto aBigger;
5749
    if ( aSig0 < bSig0 ) goto bBigger;
5750
    if ( bSig1 < aSig1 ) goto aBigger;
5751
    if ( aSig1 < bSig1 ) goto bBigger;
5752
    return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5753
 bExpBigger:
5754
    if ( bExp == 0x7FFF ) {
5755
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5756
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5757
    }
5758
    if ( aExp == 0 ) {
5759
        ++expDiff;
5760
    }
5761
    else {
5762
        aSig0 |= LIT64( 0x4000000000000000 );
5763
    }
5764
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5765
    bSig0 |= LIT64( 0x4000000000000000 );
5766
 bBigger:
5767
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5768
    zExp = bExp;
5769
    zSign ^= 1;
5770
    goto normalizeRoundAndPack;
5771
 aExpBigger:
5772
    if ( aExp == 0x7FFF ) {
5773
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5774
        return a;
5775
    }
5776
    if ( bExp == 0 ) {
5777
        --expDiff;
5778
    }
5779
    else {
5780
        bSig0 |= LIT64( 0x4000000000000000 );
5781
    }
5782
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5783
    aSig0 |= LIT64( 0x4000000000000000 );
5784
 aBigger:
5785
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5786
    zExp = aExp;
5787
 normalizeRoundAndPack:
5788
    --zExp;
5789
    return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5790

    
5791
}
5792

    
5793
/*----------------------------------------------------------------------------
5794
| Returns the result of adding the quadruple-precision floating-point values
5795
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
5796
| for Binary Floating-Point Arithmetic.
5797
*----------------------------------------------------------------------------*/
5798

    
5799
float128 float128_add( float128 a, float128 b STATUS_PARAM )
5800
{
5801
    flag aSign, bSign;
5802

    
5803
    aSign = extractFloat128Sign( a );
5804
    bSign = extractFloat128Sign( b );
5805
    if ( aSign == bSign ) {
5806
        return addFloat128Sigs( a, b, aSign STATUS_VAR );
5807
    }
5808
    else {
5809
        return subFloat128Sigs( a, b, aSign STATUS_VAR );
5810
    }
5811

    
5812
}
5813

    
5814
/*----------------------------------------------------------------------------
5815
| Returns the result of subtracting the quadruple-precision floating-point
5816
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
5817
| Standard for Binary Floating-Point Arithmetic.
5818
*----------------------------------------------------------------------------*/
5819

    
5820
float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5821
{
5822
    flag aSign, bSign;
5823

    
5824
    aSign = extractFloat128Sign( a );
5825
    bSign = extractFloat128Sign( b );
5826
    if ( aSign == bSign ) {
5827
        return subFloat128Sigs( a, b, aSign STATUS_VAR );
5828
    }
5829
    else {
5830
        return addFloat128Sigs( a, b, aSign STATUS_VAR );
5831
    }
5832

    
5833
}
5834

    
5835
/*----------------------------------------------------------------------------
5836
| Returns the result of multiplying the quadruple-precision floating-point
5837
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
5838
| Standard for Binary Floating-Point Arithmetic.
5839
*----------------------------------------------------------------------------*/
5840

    
5841
float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5842
{
5843
    flag aSign, bSign, zSign;
5844
    int32 aExp, bExp, zExp;
5845
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5846
    float128 z;
5847

    
5848
    aSig1 = extractFloat128Frac1( a );
5849
    aSig0 = extractFloat128Frac0( a );
5850
    aExp = extractFloat128Exp( a );
5851
    aSign = extractFloat128Sign( a );
5852
    bSig1 = extractFloat128Frac1( b );
5853
    bSig0 = extractFloat128Frac0( b );
5854
    bExp = extractFloat128Exp( b );
5855
    bSign = extractFloat128Sign( b );
5856
    zSign = aSign ^ bSign;
5857
    if ( aExp == 0x7FFF ) {
5858
        if (    ( aSig0 | aSig1 )
5859
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5860
            return propagateFloat128NaN( a, b STATUS_VAR );
5861
        }
5862
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5863
        return packFloat128( zSign, 0x7FFF, 0, 0 );
5864
    }
5865
    if ( bExp == 0x7FFF ) {
5866
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5867
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5868
 invalid:
5869
            float_raise( float_flag_invalid STATUS_VAR);
5870
            z.low = float128_default_nan_low;
5871
            z.high = float128_default_nan_high;
5872
            return z;
5873
        }
5874
        return packFloat128( zSign, 0x7FFF, 0, 0 );
5875
    }
5876
    if ( aExp == 0 ) {
5877
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5878
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5879
    }
5880
    if ( bExp == 0 ) {
5881
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5882
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5883
    }
5884
    zExp = aExp + bExp - 0x4000;
5885
    aSig0 |= LIT64( 0x0001000000000000 );
5886
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5887
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5888
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5889
    zSig2 |= ( zSig3 != 0 );
5890
    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5891
        shift128ExtraRightJamming(
5892
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5893
        ++zExp;
5894
    }
5895
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5896

    
5897
}
5898

    
5899
/*----------------------------------------------------------------------------
5900
| Returns the result of dividing the quadruple-precision floating-point value
5901
| `a' by the corresponding value `b'.  The operation is performed according to
5902
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5903
*----------------------------------------------------------------------------*/
5904

    
5905
float128 float128_div( float128 a, float128 b STATUS_PARAM )
5906
{
5907
    flag aSign, bSign, zSign;
5908
    int32 aExp, bExp, zExp;
5909
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5910
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5911
    float128 z;
5912

    
5913
    aSig1 = extractFloat128Frac1( a );
5914
    aSig0 = extractFloat128Frac0( a );
5915
    aExp = extractFloat128Exp( a );
5916
    aSign = extractFloat128Sign( a );
5917
    bSig1 = extractFloat128Frac1( b );
5918
    bSig0 = extractFloat128Frac0( b );
5919
    bExp = extractFloat128Exp( b );
5920
    bSign = extractFloat128Sign( b );
5921
    zSign = aSign ^ bSign;
5922
    if ( aExp == 0x7FFF ) {
5923
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5924
        if ( bExp == 0x7FFF ) {
5925
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5926
            goto invalid;
5927
        }
5928
        return packFloat128( zSign, 0x7FFF, 0, 0 );
5929
    }
5930
    if ( bExp == 0x7FFF ) {
5931
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5932
        return packFloat128( zSign, 0, 0, 0 );
5933
    }
5934
    if ( bExp == 0 ) {
5935
        if ( ( bSig0 | bSig1 ) == 0 ) {
5936
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5937
 invalid:
5938
                float_raise( float_flag_invalid STATUS_VAR);
5939
                z.low = float128_default_nan_low;
5940
                z.high = float128_default_nan_high;
5941
                return z;
5942
            }
5943
            float_raise( float_flag_divbyzero STATUS_VAR);
5944
            return packFloat128( zSign, 0x7FFF, 0, 0 );
5945
        }
5946
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5947
    }
5948
    if ( aExp == 0 ) {
5949
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5950
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5951
    }
5952
    zExp = aExp - bExp + 0x3FFD;
5953
    shortShift128Left(
5954
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5955
    shortShift128Left(
5956
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5957
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5958
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5959
        ++zExp;
5960
    }
5961
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5962
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5963
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5964
    while ( (int64_t) rem0 < 0 ) {
5965
        --zSig0;
5966
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5967
    }
5968
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5969
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5970
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5971
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5972
        while ( (int64_t) rem1 < 0 ) {
5973
            --zSig1;
5974
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5975
        }
5976
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5977
    }
5978
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5979
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5980

    
5981
}
5982

    
5983
/*----------------------------------------------------------------------------
5984
| Returns the remainder of the quadruple-precision floating-point value `a'
5985
| with respect to the corresponding value `b'.  The operation is performed
5986
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5987
*----------------------------------------------------------------------------*/
5988

    
5989
float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5990
{
5991
    flag aSign, zSign;
5992
    int32 aExp, bExp, expDiff;
5993
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5994
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
5995
    int64_t sigMean0;
5996
    float128 z;
5997

    
5998
    aSig1 = extractFloat128Frac1( a );
5999
    aSig0 = extractFloat128Frac0( a );
6000
    aExp = extractFloat128Exp( a );
6001
    aSign = extractFloat128Sign( a );
6002
    bSig1 = extractFloat128Frac1( b );
6003
    bSig0 = extractFloat128Frac0( b );
6004
    bExp = extractFloat128Exp( b );
6005
    if ( aExp == 0x7FFF ) {
6006
        if (    ( aSig0 | aSig1 )
6007
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6008
            return propagateFloat128NaN( a, b STATUS_VAR );
6009
        }
6010
        goto invalid;
6011
    }
6012
    if ( bExp == 0x7FFF ) {
6013
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6014
        return a;
6015
    }
6016
    if ( bExp == 0 ) {
6017
        if ( ( bSig0 | bSig1 ) == 0 ) {
6018
 invalid:
6019
            float_raise( float_flag_invalid STATUS_VAR);
6020
            z.low = float128_default_nan_low;
6021
            z.high = float128_default_nan_high;
6022
            return z;
6023
        }
6024
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6025
    }
6026
    if ( aExp == 0 ) {
6027
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
6028
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6029
    }
6030
    expDiff = aExp - bExp;
6031
    if ( expDiff < -1 ) return a;
6032
    shortShift128Left(
6033
        aSig0 | LIT64( 0x0001000000000000 ),
6034
        aSig1,
6035
        15 - ( expDiff < 0 ),
6036
        &aSig0,
6037
        &aSig1
6038
    );
6039
    shortShift128Left(
6040
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6041
    q = le128( bSig0, bSig1, aSig0, aSig1 );
6042
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6043
    expDiff -= 64;
6044
    while ( 0 < expDiff ) {
6045
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6046
        q = ( 4 < q ) ? q - 4 : 0;
6047
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6048
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6049
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6050
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6051
        expDiff -= 61;
6052
    }
6053
    if ( -64 < expDiff ) {
6054
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6055
        q = ( 4 < q ) ? q - 4 : 0;
6056
        q >>= - expDiff;
6057
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6058
        expDiff += 52;
6059
        if ( expDiff < 0 ) {
6060
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6061
        }
6062
        else {
6063
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6064
        }
6065
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6066
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6067
    }
6068
    else {
6069
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6070
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6071
    }
6072
    do {
6073
        alternateASig0 = aSig0;
6074
        alternateASig1 = aSig1;
6075
        ++q;
6076
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6077
    } while ( 0 <= (int64_t) aSig0 );
6078
    add128(
6079
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6080
    if (    ( sigMean0 < 0 )
6081
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6082
        aSig0 = alternateASig0;
6083
        aSig1 = alternateASig1;
6084
    }
6085
    zSign = ( (int64_t) aSig0 < 0 );
6086
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6087
    return
6088
        normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
6089

    
6090
}
6091

    
6092
/*----------------------------------------------------------------------------
6093
| Returns the square root of the quadruple-precision floating-point value `a'.
6094
| The operation is performed according to the IEC/IEEE Standard for Binary
6095
| Floating-Point Arithmetic.
6096
*----------------------------------------------------------------------------*/
6097

    
6098
float128 float128_sqrt( float128 a STATUS_PARAM )
6099
{
6100
    flag aSign;
6101
    int32 aExp, zExp;
6102
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6103
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6104
    float128 z;
6105

    
6106
    aSig1 = extractFloat128Frac1( a );
6107
    aSig0 = extractFloat128Frac0( a );
6108
    aExp = extractFloat128Exp( a );
6109
    aSign = extractFloat128Sign( a );
6110
    if ( aExp == 0x7FFF ) {
6111
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
6112
        if ( ! aSign ) return a;
6113
        goto invalid;
6114
    }
6115
    if ( aSign ) {
6116
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6117
 invalid:
6118
        float_raise( float_flag_invalid STATUS_VAR);
6119
        z.low = float128_default_nan_low;
6120
        z.high = float128_default_nan_high;
6121
        return z;
6122
    }
6123
    if ( aExp == 0 ) {
6124
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6125
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6126
    }
6127
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6128
    aSig0 |= LIT64( 0x0001000000000000 );
6129
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6130
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6131
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6132
    doubleZSig0 = zSig0<<1;
6133
    mul64To128( zSig0, zSig0, &term0, &term1 );
6134
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6135
    while ( (int64_t) rem0 < 0 ) {
6136
        --zSig0;
6137
        doubleZSig0 -= 2;
6138
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6139
    }
6140
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6141
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6142
        if ( zSig1 == 0 ) zSig1 = 1;
6143
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6144
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6145
        mul64To128( zSig1, zSig1, &term2, &term3 );
6146
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6147
        while ( (int64_t) rem1 < 0 ) {
6148
            --zSig1;
6149
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6150
            term3 |= 1;
6151
            term2 |= doubleZSig0;
6152
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6153
        }
6154
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6155
    }
6156
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6157
    return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6158

    
6159
}
6160

    
6161
/*----------------------------------------------------------------------------
6162
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6163
| the corresponding value `b', and 0 otherwise.  The invalid exception is
6164
| raised if either operand is a NaN.  Otherwise, the comparison is performed
6165
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6166
*----------------------------------------------------------------------------*/
6167

    
6168
int float128_eq( float128 a, float128 b STATUS_PARAM )
6169
{
6170

    
6171
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6172
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6173
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6174
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6175
       ) {
6176
        float_raise( float_flag_invalid STATUS_VAR);
6177
        return 0;
6178
    }
6179
    return
6180
           ( a.low == b.low )
6181
        && (    ( a.high == b.high )
6182
             || (    ( a.low == 0 )
6183
                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6184
           );
6185

    
6186
}
6187

    
6188
/*----------------------------------------------------------------------------
6189
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6190
| or equal to the corresponding value `b', and 0 otherwise.  The invalid
6191
| exception is raised if either operand is a NaN.  The comparison is performed
6192
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6193
*----------------------------------------------------------------------------*/
6194

    
6195
int float128_le( float128 a, float128 b STATUS_PARAM )
6196
{
6197
    flag aSign, bSign;
6198

    
6199
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6200
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6201
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6202
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6203
       ) {
6204
        float_raise( float_flag_invalid STATUS_VAR);
6205
        return 0;
6206
    }
6207
    aSign = extractFloat128Sign( a );
6208
    bSign = extractFloat128Sign( b );
6209
    if ( aSign != bSign ) {
6210
        return
6211
               aSign
6212
            || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6213
                 == 0 );
6214
    }
6215
    return
6216
          aSign ? le128( b.high, b.low, a.high, a.low )
6217
        : le128( a.high, a.low, b.high, b.low );
6218

    
6219
}
6220

    
6221
/*----------------------------------------------------------------------------
6222
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6223
| the corresponding value `b', and 0 otherwise.  The invalid exception is
6224
| raised if either operand is a NaN.  The comparison is performed according
6225
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6226
*----------------------------------------------------------------------------*/
6227

    
6228
int float128_lt( float128 a, float128 b STATUS_PARAM )
6229
{
6230
    flag aSign, bSign;
6231

    
6232
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6233
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6234
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6235
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6236
       ) {
6237
        float_raise( float_flag_invalid STATUS_VAR);
6238
        return 0;
6239
    }
6240
    aSign = extractFloat128Sign( a );
6241
    bSign = extractFloat128Sign( b );
6242
    if ( aSign != bSign ) {
6243
        return
6244
               aSign
6245
            && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6246
                 != 0 );
6247
    }
6248
    return
6249
          aSign ? lt128( b.high, b.low, a.high, a.low )
6250
        : lt128( a.high, a.low, b.high, b.low );
6251

    
6252
}
6253

    
6254
/*----------------------------------------------------------------------------
6255
| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6256
| be compared, and 0 otherwise.  The invalid exception is raised if either
6257
| operand is a NaN. The comparison is performed according to the IEC/IEEE
6258
| Standard for Binary Floating-Point Arithmetic.
6259
*----------------------------------------------------------------------------*/
6260

    
6261
int float128_unordered( float128 a, float128 b STATUS_PARAM )
6262
{
6263
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6264
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6265
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6266
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6267
       ) {
6268
        float_raise( float_flag_invalid STATUS_VAR);
6269
        return 1;
6270
    }
6271
    return 0;
6272
}
6273

    
6274
/*----------------------------------------------------------------------------
6275
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
6276
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6277
| exception.  The comparison is performed according to the IEC/IEEE Standard
6278
| for Binary Floating-Point Arithmetic.
6279
*----------------------------------------------------------------------------*/
6280

    
6281
int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
6282
{
6283

    
6284
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6285
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6286
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6287
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6288
       ) {
6289
        if (    float128_is_signaling_nan( a )
6290
             || float128_is_signaling_nan( b ) ) {
6291
            float_raise( float_flag_invalid STATUS_VAR);
6292
        }
6293
        return 0;
6294
    }
6295
    return
6296
           ( a.low == b.low )
6297
        && (    ( a.high == b.high )
6298
             || (    ( a.low == 0 )
6299
                  && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6300
           );
6301

    
6302
}
6303

    
6304
/*----------------------------------------------------------------------------
6305
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6306
| or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6307
| cause an exception.  Otherwise, the comparison is performed according to the
6308
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6309
*----------------------------------------------------------------------------*/
6310

    
6311
int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
6312
{
6313
    flag aSign, bSign;
6314

    
6315
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6316
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6317
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6318
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6319
       ) {
6320
        if (    float128_is_signaling_nan( a )
6321
             || float128_is_signaling_nan( b ) ) {
6322
            float_raise( float_flag_invalid STATUS_VAR);
6323
        }
6324
        return 0;
6325
    }
6326
    aSign = extractFloat128Sign( a );
6327
    bSign = extractFloat128Sign( b );
6328
    if ( aSign != bSign ) {
6329
        return
6330
               aSign
6331
            || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6332
                 == 0 );
6333
    }
6334
    return
6335
          aSign ? le128( b.high, b.low, a.high, a.low )
6336
        : le128( a.high, a.low, b.high, b.low );
6337

    
6338
}
6339

    
6340
/*----------------------------------------------------------------------------
6341
| Returns 1 if the quadruple-precision floating-point value `a' is less than
6342
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6343
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6344
| Standard for Binary Floating-Point Arithmetic.
6345
*----------------------------------------------------------------------------*/
6346

    
6347
int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
6348
{
6349
    flag aSign, bSign;
6350

    
6351
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6352
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6353
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6354
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6355
       ) {
6356
        if (    float128_is_signaling_nan( a )
6357
             || float128_is_signaling_nan( b ) ) {
6358
            float_raise( float_flag_invalid STATUS_VAR);
6359
        }
6360
        return 0;
6361
    }
6362
    aSign = extractFloat128Sign( a );
6363
    bSign = extractFloat128Sign( b );
6364
    if ( aSign != bSign ) {
6365
        return
6366
               aSign
6367
            && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6368
                 != 0 );
6369
    }
6370
    return
6371
          aSign ? lt128( b.high, b.low, a.high, a.low )
6372
        : lt128( a.high, a.low, b.high, b.low );
6373

    
6374
}
6375

    
6376
/*----------------------------------------------------------------------------
6377
| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6378
| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
6379
| comparison is performed according to the IEC/IEEE Standard for Binary
6380
| Floating-Point Arithmetic.
6381
*----------------------------------------------------------------------------*/
6382

    
6383
int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
6384
{
6385
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6386
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6387
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
6388
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6389
       ) {
6390
        if (    float128_is_signaling_nan( a )
6391
             || float128_is_signaling_nan( b ) ) {
6392
            float_raise( float_flag_invalid STATUS_VAR);
6393
        }
6394
        return 1;
6395
    }
6396
    return 0;
6397
}
6398

    
6399
/* misc functions */
6400
float32 uint32_to_float32( uint32 a STATUS_PARAM )
6401
{
6402
    return int64_to_float32(a STATUS_VAR);
6403
}
6404

    
6405
float64 uint32_to_float64( uint32 a STATUS_PARAM )
6406
{
6407
    return int64_to_float64(a STATUS_VAR);
6408
}
6409

    
6410
uint32 float32_to_uint32( float32 a STATUS_PARAM )
6411
{
6412
    int64_t v;
6413
    uint32 res;
6414

    
6415
    v = float32_to_int64(a STATUS_VAR);
6416
    if (v < 0) {
6417
        res = 0;
6418
        float_raise( float_flag_invalid STATUS_VAR);
6419
    } else if (v > 0xffffffff) {
6420
        res = 0xffffffff;
6421
        float_raise( float_flag_invalid STATUS_VAR);
6422
    } else {
6423
        res = v;
6424
    }
6425
    return res;
6426
}
6427

    
6428
uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
6429
{
6430
    int64_t v;
6431
    uint32 res;
6432

    
6433
    v = float32_to_int64_round_to_zero(a STATUS_VAR);
6434
    if (v < 0) {
6435
        res = 0;
6436
        float_raise( float_flag_invalid STATUS_VAR);
6437
    } else if (v > 0xffffffff) {
6438
        res = 0xffffffff;
6439
        float_raise( float_flag_invalid STATUS_VAR);
6440
    } else {
6441
        res = v;
6442
    }
6443
    return res;
6444
}
6445

    
6446
uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM)
6447
{
6448
    int64_t v;
6449
    uint_fast16_t res;
6450

    
6451
    v = float32_to_int64_round_to_zero(a STATUS_VAR);
6452
    if (v < 0) {
6453
        res = 0;
6454
        float_raise( float_flag_invalid STATUS_VAR);
6455
    } else if (v > 0xffff) {
6456
        res = 0xffff;
6457
        float_raise( float_flag_invalid STATUS_VAR);
6458
    } else {
6459
        res = v;
6460
    }
6461
    return res;
6462
}
6463

    
6464
uint32 float64_to_uint32( float64 a STATUS_PARAM )
6465
{
6466
    int64_t v;
6467
    uint32 res;
6468

    
6469
    v = float64_to_int64(a STATUS_VAR);
6470
    if (v < 0) {
6471
        res = 0;
6472
        float_raise( float_flag_invalid STATUS_VAR);
6473
    } else if (v > 0xffffffff) {
6474
        res = 0xffffffff;
6475
        float_raise( float_flag_invalid STATUS_VAR);
6476
    } else {
6477
        res = v;
6478
    }
6479
    return res;
6480
}
6481

    
6482
uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
6483
{
6484
    int64_t v;
6485
    uint32 res;
6486

    
6487
    v = float64_to_int64_round_to_zero(a STATUS_VAR);
6488
    if (v < 0) {
6489
        res = 0;
6490
        float_raise( float_flag_invalid STATUS_VAR);
6491
    } else if (v > 0xffffffff) {
6492
        res = 0xffffffff;
6493
        float_raise( float_flag_invalid STATUS_VAR);
6494
    } else {
6495
        res = v;
6496
    }
6497
    return res;
6498
}
6499

    
6500
uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM)
6501
{
6502
    int64_t v;
6503
    uint_fast16_t res;
6504

    
6505
    v = float64_to_int64_round_to_zero(a STATUS_VAR);
6506
    if (v < 0) {
6507
        res = 0;
6508
        float_raise( float_flag_invalid STATUS_VAR);
6509
    } else if (v > 0xffff) {
6510
        res = 0xffff;
6511
        float_raise( float_flag_invalid STATUS_VAR);
6512
    } else {
6513
        res = v;
6514
    }
6515
    return res;
6516
}
6517

    
6518
/* FIXME: This looks broken.  */
6519
uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
6520
{
6521
    int64_t v;
6522

    
6523
    v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6524
    v += float64_val(a);
6525
    v = float64_to_int64(make_float64(v) STATUS_VAR);
6526

    
6527
    return v - INT64_MIN;
6528
}
6529

    
6530
uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6531
{
6532
    int64_t v;
6533

    
6534
    v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6535
    v += float64_val(a);
6536
    v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
6537

    
6538
    return v - INT64_MIN;
6539
}
6540

    
6541
#define COMPARE(s, nan_exp)                                                  \
6542
INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
6543
                                      int is_quiet STATUS_PARAM )            \
6544
{                                                                            \
6545
    flag aSign, bSign;                                                       \
6546
    uint ## s ## _t av, bv;                                                  \
6547
    a = float ## s ## _squash_input_denormal(a STATUS_VAR);                  \
6548
    b = float ## s ## _squash_input_denormal(b STATUS_VAR);                  \
6549
                                                                             \
6550
    if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
6551
         extractFloat ## s ## Frac( a ) ) ||                                 \
6552
        ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
6553
          extractFloat ## s ## Frac( b ) )) {                                \
6554
        if (!is_quiet ||                                                     \
6555
            float ## s ## _is_signaling_nan( a ) ||                          \
6556
            float ## s ## _is_signaling_nan( b ) ) {                         \
6557
            float_raise( float_flag_invalid STATUS_VAR);                     \
6558
        }                                                                    \
6559
        return float_relation_unordered;                                     \
6560
    }                                                                        \
6561
    aSign = extractFloat ## s ## Sign( a );                                  \
6562
    bSign = extractFloat ## s ## Sign( b );                                  \
6563
    av = float ## s ## _val(a);                                              \
6564
    bv = float ## s ## _val(b);                                              \
6565
    if ( aSign != bSign ) {                                                  \
6566
        if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
6567
            /* zero case */                                                  \
6568
            return float_relation_equal;                                     \
6569
        } else {                                                             \
6570
            return 1 - (2 * aSign);                                          \
6571
        }                                                                    \
6572
    } else {                                                                 \
6573
        if (av == bv) {                                                      \
6574
            return float_relation_equal;                                     \
6575
        } else {                                                             \
6576
            return 1 - 2 * (aSign ^ ( av < bv ));                            \
6577
        }                                                                    \
6578
    }                                                                        \
6579
}                                                                            \
6580
                                                                             \
6581
int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
6582
{                                                                            \
6583
    return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
6584
}                                                                            \
6585
                                                                             \
6586
int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
6587
{                                                                            \
6588
    return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
6589
}
6590

    
6591
COMPARE(32, 0xff)
6592
COMPARE(64, 0x7ff)
6593

    
6594
INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
6595
                                      int is_quiet STATUS_PARAM )
6596
{
6597
    flag aSign, bSign;
6598

    
6599
    if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6600
          ( extractFloatx80Frac( a )<<1 ) ) ||
6601
        ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6602
          ( extractFloatx80Frac( b )<<1 ) )) {
6603
        if (!is_quiet ||
6604
            floatx80_is_signaling_nan( a ) ||
6605
            floatx80_is_signaling_nan( b ) ) {
6606
            float_raise( float_flag_invalid STATUS_VAR);
6607
        }
6608
        return float_relation_unordered;
6609
    }
6610
    aSign = extractFloatx80Sign( a );
6611
    bSign = extractFloatx80Sign( b );
6612
    if ( aSign != bSign ) {
6613

    
6614
        if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6615
             ( ( a.low | b.low ) == 0 ) ) {
6616
            /* zero case */
6617
            return float_relation_equal;
6618
        } else {
6619
            return 1 - (2 * aSign);
6620
        }
6621
    } else {
6622
        if (a.low == b.low && a.high == b.high) {
6623
            return float_relation_equal;
6624
        } else {
6625
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6626
        }
6627
    }
6628
}
6629

    
6630
int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
6631
{
6632
    return floatx80_compare_internal(a, b, 0 STATUS_VAR);
6633
}
6634

    
6635
int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
6636
{
6637
    return floatx80_compare_internal(a, b, 1 STATUS_VAR);
6638
}
6639

    
6640
INLINE int float128_compare_internal( float128 a, float128 b,
6641
                                      int is_quiet STATUS_PARAM )
6642
{
6643
    flag aSign, bSign;
6644

    
6645
    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6646
          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6647
        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6648
          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6649
        if (!is_quiet ||
6650
            float128_is_signaling_nan( a ) ||
6651
            float128_is_signaling_nan( b ) ) {
6652
            float_raise( float_flag_invalid STATUS_VAR);
6653
        }
6654
        return float_relation_unordered;
6655
    }
6656
    aSign = extractFloat128Sign( a );
6657
    bSign = extractFloat128Sign( b );
6658
    if ( aSign != bSign ) {
6659
        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6660
            /* zero case */
6661
            return float_relation_equal;
6662
        } else {
6663
            return 1 - (2 * aSign);
6664
        }
6665
    } else {
6666
        if (a.low == b.low && a.high == b.high) {
6667
            return float_relation_equal;
6668
        } else {
6669
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6670
        }
6671
    }
6672
}
6673

    
6674
int float128_compare( float128 a, float128 b STATUS_PARAM )
6675
{
6676
    return float128_compare_internal(a, b, 0 STATUS_VAR);
6677
}
6678

    
6679
int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6680
{
6681
    return float128_compare_internal(a, b, 1 STATUS_VAR);
6682
}
6683

    
6684
/* min() and max() functions. These can't be implemented as
6685
 * 'compare and pick one input' because that would mishandle
6686
 * NaNs and +0 vs -0.
6687
 */
6688
#define MINMAX(s, nan_exp)                                              \
6689
INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
6690
                                        int ismin STATUS_PARAM )        \
6691
{                                                                       \
6692
    flag aSign, bSign;                                                  \
6693
    uint ## s ## _t av, bv;                                             \
6694
    a = float ## s ## _squash_input_denormal(a STATUS_VAR);             \
6695
    b = float ## s ## _squash_input_denormal(b STATUS_VAR);             \
6696
    if (float ## s ## _is_any_nan(a) ||                                 \
6697
        float ## s ## _is_any_nan(b)) {                                 \
6698
        return propagateFloat ## s ## NaN(a, b STATUS_VAR);             \
6699
    }                                                                   \
6700
    aSign = extractFloat ## s ## Sign(a);                               \
6701
    bSign = extractFloat ## s ## Sign(b);                               \
6702
    av = float ## s ## _val(a);                                         \
6703
    bv = float ## s ## _val(b);                                         \
6704
    if (aSign != bSign) {                                               \
6705
        if (ismin) {                                                    \
6706
            return aSign ? a : b;                                       \
6707
        } else {                                                        \
6708
            return aSign ? b : a;                                       \
6709
        }                                                               \
6710
    } else {                                                            \
6711
        if (ismin) {                                                    \
6712
            return (aSign ^ (av < bv)) ? a : b;                         \
6713
        } else {                                                        \
6714
            return (aSign ^ (av < bv)) ? b : a;                         \
6715
        }                                                               \
6716
    }                                                                   \
6717
}                                                                       \
6718
                                                                        \
6719
float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM)  \
6720
{                                                                       \
6721
    return float ## s ## _minmax(a, b, 1 STATUS_VAR);                   \
6722
}                                                                       \
6723
                                                                        \
6724
float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM)  \
6725
{                                                                       \
6726
    return float ## s ## _minmax(a, b, 0 STATUS_VAR);                   \
6727
}
6728

    
6729
MINMAX(32, 0xff)
6730
MINMAX(64, 0x7ff)
6731

    
6732

    
6733
/* Multiply A by 2 raised to the power N.  */
6734
float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6735
{
6736
    flag aSign;
6737
    int16_t aExp;
6738
    uint32_t aSig;
6739

    
6740
    a = float32_squash_input_denormal(a STATUS_VAR);
6741
    aSig = extractFloat32Frac( a );
6742
    aExp = extractFloat32Exp( a );
6743
    aSign = extractFloat32Sign( a );
6744

    
6745
    if ( aExp == 0xFF ) {
6746
        if ( aSig ) {
6747
            return propagateFloat32NaN( a, a STATUS_VAR );
6748
        }
6749
        return a;
6750
    }
6751
    if ( aExp != 0 )
6752
        aSig |= 0x00800000;
6753
    else if ( aSig == 0 )
6754
        return a;
6755

    
6756
    if (n > 0x200) {
6757
        n = 0x200;
6758
    } else if (n < -0x200) {
6759
        n = -0x200;
6760
    }
6761

    
6762
    aExp += n - 1;
6763
    aSig <<= 7;
6764
    return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6765
}
6766

    
6767
float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6768
{
6769
    flag aSign;
6770
    int16_t aExp;
6771
    uint64_t aSig;
6772

    
6773
    a = float64_squash_input_denormal(a STATUS_VAR);
6774
    aSig = extractFloat64Frac( a );
6775
    aExp = extractFloat64Exp( a );
6776
    aSign = extractFloat64Sign( a );
6777

    
6778
    if ( aExp == 0x7FF ) {
6779
        if ( aSig ) {
6780
            return propagateFloat64NaN( a, a STATUS_VAR );
6781
        }
6782
        return a;
6783
    }
6784
    if ( aExp != 0 )
6785
        aSig |= LIT64( 0x0010000000000000 );
6786
    else if ( aSig == 0 )
6787
        return a;
6788

    
6789
    if (n > 0x1000) {
6790
        n = 0x1000;
6791
    } else if (n < -0x1000) {
6792
        n = -0x1000;
6793
    }
6794

    
6795
    aExp += n - 1;
6796
    aSig <<= 10;
6797
    return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6798
}
6799

    
6800
floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6801
{
6802
    flag aSign;
6803
    int32_t aExp;
6804
    uint64_t aSig;
6805

    
6806
    aSig = extractFloatx80Frac( a );
6807
    aExp = extractFloatx80Exp( a );
6808
    aSign = extractFloatx80Sign( a );
6809

    
6810
    if ( aExp == 0x7FFF ) {
6811
        if ( aSig<<1 ) {
6812
            return propagateFloatx80NaN( a, a STATUS_VAR );
6813
        }
6814
        return a;
6815
    }
6816

    
6817
    if (aExp == 0 && aSig == 0)
6818
        return a;
6819

    
6820
    if (n > 0x10000) {
6821
        n = 0x10000;
6822
    } else if (n < -0x10000) {
6823
        n = -0x10000;
6824
    }
6825

    
6826
    aExp += n;
6827
    return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6828
                                          aSign, aExp, aSig, 0 STATUS_VAR );
6829
}
6830

    
6831
float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6832
{
6833
    flag aSign;
6834
    int32_t aExp;
6835
    uint64_t aSig0, aSig1;
6836

    
6837
    aSig1 = extractFloat128Frac1( a );
6838
    aSig0 = extractFloat128Frac0( a );
6839
    aExp = extractFloat128Exp( a );
6840
    aSign = extractFloat128Sign( a );
6841
    if ( aExp == 0x7FFF ) {
6842
        if ( aSig0 | aSig1 ) {
6843
            return propagateFloat128NaN( a, a STATUS_VAR );
6844
        }
6845
        return a;
6846
    }
6847
    if ( aExp != 0 )
6848
        aSig0 |= LIT64( 0x0001000000000000 );
6849
    else if ( aSig0 == 0 && aSig1 == 0 )
6850
        return a;
6851

    
6852
    if (n > 0x10000) {
6853
        n = 0x10000;
6854
    } else if (n < -0x10000) {
6855
        n = -0x10000;
6856
    }
6857

    
6858
    aExp += n - 1;
6859
    return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6860
                                          STATUS_VAR );
6861

    
6862
}