Statistics
| Branch: | Revision:

root / fpu / softfloat.c @ 99036865

History | View | Annotate | Download (214.7 kB)

1

    
2
/*============================================================================
3

4
This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5
Package, Release 2b.
6

7
Written by John R. Hauser.  This work was made possible in part by the
8
International Computer Science Institute, located at Suite 600, 1947 Center
9
Street, Berkeley, California 94704.  Funding was partially provided by the
10
National Science Foundation under grant MIP-9311980.  The original version
11
of this code was written as part of a project to build a fixed-point vector
12
processor in collaboration with the University of California at Berkeley,
13
overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
14
is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15
arithmetic/SoftFloat.html'.
16

17
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
18
been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19
RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20
AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21
COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22
EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23
INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24
OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
25

26
Derivative works are acceptable, even for commercial purposes, so long as
27
(1) the source code for the derivative work includes prominent notice that
28
the work is derivative, and (2) the source code includes prominent notice with
29
these four paragraphs for those parts of this code that are retained.
30

31
=============================================================================*/
32

    
33
#include "softfloat.h"
34

    
35
/*----------------------------------------------------------------------------
36
| Primitive arithmetic functions, including multi-word arithmetic, and
37
| division and square root approximations.  (Can be specialized to target if
38
| desired.)
39
*----------------------------------------------------------------------------*/
40
#include "softfloat-macros.h"
41

    
42
/*----------------------------------------------------------------------------
43
| Functions and definitions to determine:  (1) whether tininess for underflow
44
| is detected before or after rounding by default, (2) what (if anything)
45
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
46
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
47
| are propagated from function inputs to output.  These details are target-
48
| specific.
49
*----------------------------------------------------------------------------*/
50
#include "softfloat-specialize.h"
51

    
52
void set_float_rounding_mode(int val STATUS_PARAM)
53
{
54
    STATUS(float_rounding_mode) = val;
55
}
56

    
57
void set_float_exception_flags(int val STATUS_PARAM)
58
{
59
    STATUS(float_exception_flags) = val;
60
}
61

    
62
#ifdef FLOATX80
63
void set_floatx80_rounding_precision(int val STATUS_PARAM)
64
{
65
    STATUS(floatx80_rounding_precision) = val;
66
}
67
#endif
68

    
69
/*----------------------------------------------------------------------------
70
| Returns the fraction bits of the half-precision floating-point value `a'.
71
*----------------------------------------------------------------------------*/
72

    
73
INLINE uint32_t extractFloat16Frac(float16 a)
74
{
75
    return float16_val(a) & 0x3ff;
76
}
77

    
78
/*----------------------------------------------------------------------------
79
| Returns the exponent bits of the half-precision floating-point value `a'.
80
*----------------------------------------------------------------------------*/
81

    
82
INLINE int16 extractFloat16Exp(float16 a)
83
{
84
    return (float16_val(a) >> 10) & 0x1f;
85
}
86

    
87
/*----------------------------------------------------------------------------
88
| Returns the sign bit of the single-precision floating-point value `a'.
89
*----------------------------------------------------------------------------*/
90

    
91
INLINE flag extractFloat16Sign(float16 a)
92
{
93
    return float16_val(a)>>15;
94
}
95

    
96
/*----------------------------------------------------------------------------
97
| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
98
| and 7, and returns the properly rounded 32-bit integer corresponding to the
99
| input.  If `zSign' is 1, the input is negated before being converted to an
100
| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
101
| is simply rounded to an integer, with the inexact exception raised if the
102
| input cannot be represented exactly as an integer.  However, if the fixed-
103
| point input is too large, the invalid exception is raised and the largest
104
| positive or negative integer is returned.
105
*----------------------------------------------------------------------------*/
106

    
107
static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
108
{
109
    int8 roundingMode;
110
    flag roundNearestEven;
111
    int8 roundIncrement, roundBits;
112
    int32 z;
113

    
114
    roundingMode = STATUS(float_rounding_mode);
115
    roundNearestEven = ( roundingMode == float_round_nearest_even );
116
    roundIncrement = 0x40;
117
    if ( ! roundNearestEven ) {
118
        if ( roundingMode == float_round_to_zero ) {
119
            roundIncrement = 0;
120
        }
121
        else {
122
            roundIncrement = 0x7F;
123
            if ( zSign ) {
124
                if ( roundingMode == float_round_up ) roundIncrement = 0;
125
            }
126
            else {
127
                if ( roundingMode == float_round_down ) roundIncrement = 0;
128
            }
129
        }
130
    }
131
    roundBits = absZ & 0x7F;
132
    absZ = ( absZ + roundIncrement )>>7;
133
    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
134
    z = absZ;
135
    if ( zSign ) z = - z;
136
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
137
        float_raise( float_flag_invalid STATUS_VAR);
138
        return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
139
    }
140
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
141
    return z;
142

    
143
}
144

    
145
/*----------------------------------------------------------------------------
146
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
147
| `absZ1', with binary point between bits 63 and 64 (between the input words),
148
| and returns the properly rounded 64-bit integer corresponding to the input.
149
| If `zSign' is 1, the input is negated before being converted to an integer.
150
| Ordinarily, the fixed-point input is simply rounded to an integer, with
151
| the inexact exception raised if the input cannot be represented exactly as
152
| an integer.  However, if the fixed-point input is too large, the invalid
153
| exception is raised and the largest positive or negative integer is
154
| returned.
155
*----------------------------------------------------------------------------*/
156

    
157
static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
158
{
159
    int8 roundingMode;
160
    flag roundNearestEven, increment;
161
    int64 z;
162

    
163
    roundingMode = STATUS(float_rounding_mode);
164
    roundNearestEven = ( roundingMode == float_round_nearest_even );
165
    increment = ( (sbits64) absZ1 < 0 );
166
    if ( ! roundNearestEven ) {
167
        if ( roundingMode == float_round_to_zero ) {
168
            increment = 0;
169
        }
170
        else {
171
            if ( zSign ) {
172
                increment = ( roundingMode == float_round_down ) && absZ1;
173
            }
174
            else {
175
                increment = ( roundingMode == float_round_up ) && absZ1;
176
            }
177
        }
178
    }
179
    if ( increment ) {
180
        ++absZ0;
181
        if ( absZ0 == 0 ) goto overflow;
182
        absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
183
    }
184
    z = absZ0;
185
    if ( zSign ) z = - z;
186
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
187
 overflow:
188
        float_raise( float_flag_invalid STATUS_VAR);
189
        return
190
              zSign ? (sbits64) LIT64( 0x8000000000000000 )
191
            : LIT64( 0x7FFFFFFFFFFFFFFF );
192
    }
193
    if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
194
    return z;
195

    
196
}
197

    
198
/*----------------------------------------------------------------------------
199
| Returns the fraction bits of the single-precision floating-point value `a'.
200
*----------------------------------------------------------------------------*/
201

    
202
INLINE bits32 extractFloat32Frac( float32 a )
203
{
204

    
205
    return float32_val(a) & 0x007FFFFF;
206

    
207
}
208

    
209
/*----------------------------------------------------------------------------
210
| Returns the exponent bits of the single-precision floating-point value `a'.
211
*----------------------------------------------------------------------------*/
212

    
213
INLINE int16 extractFloat32Exp( float32 a )
214
{
215

    
216
    return ( float32_val(a)>>23 ) & 0xFF;
217

    
218
}
219

    
220
/*----------------------------------------------------------------------------
221
| Returns the sign bit of the single-precision floating-point value `a'.
222
*----------------------------------------------------------------------------*/
223

    
224
INLINE flag extractFloat32Sign( float32 a )
225
{
226

    
227
    return float32_val(a)>>31;
228

    
229
}
230

    
231
/*----------------------------------------------------------------------------
232
| If `a' is denormal and we are in flush-to-zero mode then set the
233
| input-denormal exception and return zero. Otherwise just return the value.
234
*----------------------------------------------------------------------------*/
235
static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
236
{
237
    if (STATUS(flush_inputs_to_zero)) {
238
        if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
239
            float_raise(float_flag_input_denormal STATUS_VAR);
240
            return make_float32(float32_val(a) & 0x80000000);
241
        }
242
    }
243
    return a;
244
}
245

    
246
/*----------------------------------------------------------------------------
247
| Normalizes the subnormal single-precision floating-point value represented
248
| by the denormalized significand `aSig'.  The normalized exponent and
249
| significand are stored at the locations pointed to by `zExpPtr' and
250
| `zSigPtr', respectively.
251
*----------------------------------------------------------------------------*/
252

    
253
static void
254
 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
255
{
256
    int8 shiftCount;
257

    
258
    shiftCount = countLeadingZeros32( aSig ) - 8;
259
    *zSigPtr = aSig<<shiftCount;
260
    *zExpPtr = 1 - shiftCount;
261

    
262
}
263

    
264
/*----------------------------------------------------------------------------
265
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
266
| single-precision floating-point value, returning the result.  After being
267
| shifted into the proper positions, the three fields are simply added
268
| together to form the result.  This means that any integer portion of `zSig'
269
| will be added into the exponent.  Since a properly normalized significand
270
| will have an integer portion equal to 1, the `zExp' input should be 1 less
271
| than the desired result exponent whenever `zSig' is a complete, normalized
272
| significand.
273
*----------------------------------------------------------------------------*/
274

    
275
INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
276
{
277

    
278
    return make_float32(
279
          ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
280

    
281
}
282

    
283
/*----------------------------------------------------------------------------
284
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
285
| and significand `zSig', and returns the proper single-precision floating-
286
| point value corresponding to the abstract input.  Ordinarily, the abstract
287
| value is simply rounded and packed into the single-precision format, with
288
| the inexact exception raised if the abstract input cannot be represented
289
| exactly.  However, if the abstract value is too large, the overflow and
290
| inexact exceptions are raised and an infinity or maximal finite value is
291
| returned.  If the abstract value is too small, the input value is rounded to
292
| a subnormal number, and the underflow and inexact exceptions are raised if
293
| the abstract input cannot be represented exactly as a subnormal single-
294
| precision floating-point number.
295
|     The input significand `zSig' has its binary point between bits 30
296
| and 29, which is 7 bits to the left of the usual location.  This shifted
297
| significand must be normalized or smaller.  If `zSig' is not normalized,
298
| `zExp' must be 0; in that case, the result returned is a subnormal number,
299
| and it must not require rounding.  In the usual case that `zSig' is
300
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
301
| The handling of underflow and overflow follows the IEC/IEEE Standard for
302
| Binary Floating-Point Arithmetic.
303
*----------------------------------------------------------------------------*/
304

    
305
static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
306
{
307
    int8 roundingMode;
308
    flag roundNearestEven;
309
    int8 roundIncrement, roundBits;
310
    flag isTiny;
311

    
312
    roundingMode = STATUS(float_rounding_mode);
313
    roundNearestEven = ( roundingMode == float_round_nearest_even );
314
    roundIncrement = 0x40;
315
    if ( ! roundNearestEven ) {
316
        if ( roundingMode == float_round_to_zero ) {
317
            roundIncrement = 0;
318
        }
319
        else {
320
            roundIncrement = 0x7F;
321
            if ( zSign ) {
322
                if ( roundingMode == float_round_up ) roundIncrement = 0;
323
            }
324
            else {
325
                if ( roundingMode == float_round_down ) roundIncrement = 0;
326
            }
327
        }
328
    }
329
    roundBits = zSig & 0x7F;
330
    if ( 0xFD <= (bits16) zExp ) {
331
        if (    ( 0xFD < zExp )
332
             || (    ( zExp == 0xFD )
333
                  && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
334
           ) {
335
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
336
            return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
337
        }
338
        if ( zExp < 0 ) {
339
            if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
340
            isTiny =
341
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
342
                || ( zExp < -1 )
343
                || ( zSig + roundIncrement < 0x80000000 );
344
            shift32RightJamming( zSig, - zExp, &zSig );
345
            zExp = 0;
346
            roundBits = zSig & 0x7F;
347
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
348
        }
349
    }
350
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
351
    zSig = ( zSig + roundIncrement )>>7;
352
    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
353
    if ( zSig == 0 ) zExp = 0;
354
    return packFloat32( zSign, zExp, zSig );
355

    
356
}
357

    
358
/*----------------------------------------------------------------------------
359
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
360
| and significand `zSig', and returns the proper single-precision floating-
361
| point value corresponding to the abstract input.  This routine is just like
362
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
363
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
364
| floating-point exponent.
365
*----------------------------------------------------------------------------*/
366

    
367
static float32
368
 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
369
{
370
    int8 shiftCount;
371

    
372
    shiftCount = countLeadingZeros32( zSig ) - 1;
373
    return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
374

    
375
}
376

    
377
/*----------------------------------------------------------------------------
378
| Returns the fraction bits of the double-precision floating-point value `a'.
379
*----------------------------------------------------------------------------*/
380

    
381
INLINE bits64 extractFloat64Frac( float64 a )
382
{
383

    
384
    return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
385

    
386
}
387

    
388
/*----------------------------------------------------------------------------
389
| Returns the exponent bits of the double-precision floating-point value `a'.
390
*----------------------------------------------------------------------------*/
391

    
392
INLINE int16 extractFloat64Exp( float64 a )
393
{
394

    
395
    return ( float64_val(a)>>52 ) & 0x7FF;
396

    
397
}
398

    
399
/*----------------------------------------------------------------------------
400
| Returns the sign bit of the double-precision floating-point value `a'.
401
*----------------------------------------------------------------------------*/
402

    
403
INLINE flag extractFloat64Sign( float64 a )
404
{
405

    
406
    return float64_val(a)>>63;
407

    
408
}
409

    
410
/*----------------------------------------------------------------------------
411
| If `a' is denormal and we are in flush-to-zero mode then set the
412
| input-denormal exception and return zero. Otherwise just return the value.
413
*----------------------------------------------------------------------------*/
414
static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
415
{
416
    if (STATUS(flush_inputs_to_zero)) {
417
        if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
418
            float_raise(float_flag_input_denormal STATUS_VAR);
419
            return make_float64(float64_val(a) & (1ULL << 63));
420
        }
421
    }
422
    return a;
423
}
424

    
425
/*----------------------------------------------------------------------------
426
| Normalizes the subnormal double-precision floating-point value represented
427
| by the denormalized significand `aSig'.  The normalized exponent and
428
| significand are stored at the locations pointed to by `zExpPtr' and
429
| `zSigPtr', respectively.
430
*----------------------------------------------------------------------------*/
431

    
432
static void
433
 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
434
{
435
    int8 shiftCount;
436

    
437
    shiftCount = countLeadingZeros64( aSig ) - 11;
438
    *zSigPtr = aSig<<shiftCount;
439
    *zExpPtr = 1 - shiftCount;
440

    
441
}
442

    
443
/*----------------------------------------------------------------------------
444
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
445
| double-precision floating-point value, returning the result.  After being
446
| shifted into the proper positions, the three fields are simply added
447
| together to form the result.  This means that any integer portion of `zSig'
448
| will be added into the exponent.  Since a properly normalized significand
449
| will have an integer portion equal to 1, the `zExp' input should be 1 less
450
| than the desired result exponent whenever `zSig' is a complete, normalized
451
| significand.
452
*----------------------------------------------------------------------------*/
453

    
454
INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
455
{
456

    
457
    return make_float64(
458
        ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
459

    
460
}
461

    
462
/*----------------------------------------------------------------------------
463
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
464
| and significand `zSig', and returns the proper double-precision floating-
465
| point value corresponding to the abstract input.  Ordinarily, the abstract
466
| value is simply rounded and packed into the double-precision format, with
467
| the inexact exception raised if the abstract input cannot be represented
468
| exactly.  However, if the abstract value is too large, the overflow and
469
| inexact exceptions are raised and an infinity or maximal finite value is
470
| returned.  If the abstract value is too small, the input value is rounded
471
| to a subnormal number, and the underflow and inexact exceptions are raised
472
| if the abstract input cannot be represented exactly as a subnormal double-
473
| precision floating-point number.
474
|     The input significand `zSig' has its binary point between bits 62
475
| and 61, which is 10 bits to the left of the usual location.  This shifted
476
| significand must be normalized or smaller.  If `zSig' is not normalized,
477
| `zExp' must be 0; in that case, the result returned is a subnormal number,
478
| and it must not require rounding.  In the usual case that `zSig' is
479
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
480
| The handling of underflow and overflow follows the IEC/IEEE Standard for
481
| Binary Floating-Point Arithmetic.
482
*----------------------------------------------------------------------------*/
483

    
484
static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
485
{
486
    int8 roundingMode;
487
    flag roundNearestEven;
488
    int16 roundIncrement, roundBits;
489
    flag isTiny;
490

    
491
    roundingMode = STATUS(float_rounding_mode);
492
    roundNearestEven = ( roundingMode == float_round_nearest_even );
493
    roundIncrement = 0x200;
494
    if ( ! roundNearestEven ) {
495
        if ( roundingMode == float_round_to_zero ) {
496
            roundIncrement = 0;
497
        }
498
        else {
499
            roundIncrement = 0x3FF;
500
            if ( zSign ) {
501
                if ( roundingMode == float_round_up ) roundIncrement = 0;
502
            }
503
            else {
504
                if ( roundingMode == float_round_down ) roundIncrement = 0;
505
            }
506
        }
507
    }
508
    roundBits = zSig & 0x3FF;
509
    if ( 0x7FD <= (bits16) zExp ) {
510
        if (    ( 0x7FD < zExp )
511
             || (    ( zExp == 0x7FD )
512
                  && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
513
           ) {
514
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
515
            return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
516
        }
517
        if ( zExp < 0 ) {
518
            if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
519
            isTiny =
520
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
521
                || ( zExp < -1 )
522
                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
523
            shift64RightJamming( zSig, - zExp, &zSig );
524
            zExp = 0;
525
            roundBits = zSig & 0x3FF;
526
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
527
        }
528
    }
529
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
530
    zSig = ( zSig + roundIncrement )>>10;
531
    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
532
    if ( zSig == 0 ) zExp = 0;
533
    return packFloat64( zSign, zExp, zSig );
534

    
535
}
536

    
537
/*----------------------------------------------------------------------------
538
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
539
| and significand `zSig', and returns the proper double-precision floating-
540
| point value corresponding to the abstract input.  This routine is just like
541
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
542
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
543
| floating-point exponent.
544
*----------------------------------------------------------------------------*/
545

    
546
static float64
547
 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
548
{
549
    int8 shiftCount;
550

    
551
    shiftCount = countLeadingZeros64( zSig ) - 1;
552
    return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
553

    
554
}
555

    
556
#ifdef FLOATX80
557

    
558
/*----------------------------------------------------------------------------
559
| Returns the fraction bits of the extended double-precision floating-point
560
| value `a'.
561
*----------------------------------------------------------------------------*/
562

    
563
INLINE bits64 extractFloatx80Frac( floatx80 a )
564
{
565

    
566
    return a.low;
567

    
568
}
569

    
570
/*----------------------------------------------------------------------------
571
| Returns the exponent bits of the extended double-precision floating-point
572
| value `a'.
573
*----------------------------------------------------------------------------*/
574

    
575
INLINE int32 extractFloatx80Exp( floatx80 a )
576
{
577

    
578
    return a.high & 0x7FFF;
579

    
580
}
581

    
582
/*----------------------------------------------------------------------------
583
| Returns the sign bit of the extended double-precision floating-point value
584
| `a'.
585
*----------------------------------------------------------------------------*/
586

    
587
INLINE flag extractFloatx80Sign( floatx80 a )
588
{
589

    
590
    return a.high>>15;
591

    
592
}
593

    
594
/*----------------------------------------------------------------------------
595
| Normalizes the subnormal extended double-precision floating-point value
596
| represented by the denormalized significand `aSig'.  The normalized exponent
597
| and significand are stored at the locations pointed to by `zExpPtr' and
598
| `zSigPtr', respectively.
599
*----------------------------------------------------------------------------*/
600

    
601
static void
602
 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
603
{
604
    int8 shiftCount;
605

    
606
    shiftCount = countLeadingZeros64( aSig );
607
    *zSigPtr = aSig<<shiftCount;
608
    *zExpPtr = 1 - shiftCount;
609

    
610
}
611

    
612
/*----------------------------------------------------------------------------
613
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
614
| extended double-precision floating-point value, returning the result.
615
*----------------------------------------------------------------------------*/
616

    
617
INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
618
{
619
    floatx80 z;
620

    
621
    z.low = zSig;
622
    z.high = ( ( (bits16) zSign )<<15 ) + zExp;
623
    return z;
624

    
625
}
626

    
627
/*----------------------------------------------------------------------------
628
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
629
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
630
| and returns the proper extended double-precision floating-point value
631
| corresponding to the abstract input.  Ordinarily, the abstract value is
632
| rounded and packed into the extended double-precision format, with the
633
| inexact exception raised if the abstract input cannot be represented
634
| exactly.  However, if the abstract value is too large, the overflow and
635
| inexact exceptions are raised and an infinity or maximal finite value is
636
| returned.  If the abstract value is too small, the input value is rounded to
637
| a subnormal number, and the underflow and inexact exceptions are raised if
638
| the abstract input cannot be represented exactly as a subnormal extended
639
| double-precision floating-point number.
640
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
641
| number of bits as single or double precision, respectively.  Otherwise, the
642
| result is rounded to the full precision of the extended double-precision
643
| format.
644
|     The input significand must be normalized or smaller.  If the input
645
| significand is not normalized, `zExp' must be 0; in that case, the result
646
| returned is a subnormal number, and it must not require rounding.  The
647
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
648
| Floating-Point Arithmetic.
649
*----------------------------------------------------------------------------*/
650

    
651
static floatx80
652
 roundAndPackFloatx80(
653
     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
654
 STATUS_PARAM)
655
{
656
    int8 roundingMode;
657
    flag roundNearestEven, increment, isTiny;
658
    int64 roundIncrement, roundMask, roundBits;
659

    
660
    roundingMode = STATUS(float_rounding_mode);
661
    roundNearestEven = ( roundingMode == float_round_nearest_even );
662
    if ( roundingPrecision == 80 ) goto precision80;
663
    if ( roundingPrecision == 64 ) {
664
        roundIncrement = LIT64( 0x0000000000000400 );
665
        roundMask = LIT64( 0x00000000000007FF );
666
    }
667
    else if ( roundingPrecision == 32 ) {
668
        roundIncrement = LIT64( 0x0000008000000000 );
669
        roundMask = LIT64( 0x000000FFFFFFFFFF );
670
    }
671
    else {
672
        goto precision80;
673
    }
674
    zSig0 |= ( zSig1 != 0 );
675
    if ( ! roundNearestEven ) {
676
        if ( roundingMode == float_round_to_zero ) {
677
            roundIncrement = 0;
678
        }
679
        else {
680
            roundIncrement = roundMask;
681
            if ( zSign ) {
682
                if ( roundingMode == float_round_up ) roundIncrement = 0;
683
            }
684
            else {
685
                if ( roundingMode == float_round_down ) roundIncrement = 0;
686
            }
687
        }
688
    }
689
    roundBits = zSig0 & roundMask;
690
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
691
        if (    ( 0x7FFE < zExp )
692
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
693
           ) {
694
            goto overflow;
695
        }
696
        if ( zExp <= 0 ) {
697
            if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
698
            isTiny =
699
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
700
                || ( zExp < 0 )
701
                || ( zSig0 <= zSig0 + roundIncrement );
702
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
703
            zExp = 0;
704
            roundBits = zSig0 & roundMask;
705
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
706
            if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
707
            zSig0 += roundIncrement;
708
            if ( (sbits64) zSig0 < 0 ) zExp = 1;
709
            roundIncrement = roundMask + 1;
710
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
711
                roundMask |= roundIncrement;
712
            }
713
            zSig0 &= ~ roundMask;
714
            return packFloatx80( zSign, zExp, zSig0 );
715
        }
716
    }
717
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
718
    zSig0 += roundIncrement;
719
    if ( zSig0 < roundIncrement ) {
720
        ++zExp;
721
        zSig0 = LIT64( 0x8000000000000000 );
722
    }
723
    roundIncrement = roundMask + 1;
724
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
725
        roundMask |= roundIncrement;
726
    }
727
    zSig0 &= ~ roundMask;
728
    if ( zSig0 == 0 ) zExp = 0;
729
    return packFloatx80( zSign, zExp, zSig0 );
730
 precision80:
731
    increment = ( (sbits64) zSig1 < 0 );
732
    if ( ! roundNearestEven ) {
733
        if ( roundingMode == float_round_to_zero ) {
734
            increment = 0;
735
        }
736
        else {
737
            if ( zSign ) {
738
                increment = ( roundingMode == float_round_down ) && zSig1;
739
            }
740
            else {
741
                increment = ( roundingMode == float_round_up ) && zSig1;
742
            }
743
        }
744
    }
745
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
746
        if (    ( 0x7FFE < zExp )
747
             || (    ( zExp == 0x7FFE )
748
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
749
                  && increment
750
                )
751
           ) {
752
            roundMask = 0;
753
 overflow:
754
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
755
            if (    ( roundingMode == float_round_to_zero )
756
                 || ( zSign && ( roundingMode == float_round_up ) )
757
                 || ( ! zSign && ( roundingMode == float_round_down ) )
758
               ) {
759
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
760
            }
761
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
762
        }
763
        if ( zExp <= 0 ) {
764
            isTiny =
765
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
766
                || ( zExp < 0 )
767
                || ! increment
768
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
769
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
770
            zExp = 0;
771
            if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
772
            if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
773
            if ( roundNearestEven ) {
774
                increment = ( (sbits64) zSig1 < 0 );
775
            }
776
            else {
777
                if ( zSign ) {
778
                    increment = ( roundingMode == float_round_down ) && zSig1;
779
                }
780
                else {
781
                    increment = ( roundingMode == float_round_up ) && zSig1;
782
                }
783
            }
784
            if ( increment ) {
785
                ++zSig0;
786
                zSig0 &=
787
                    ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
788
                if ( (sbits64) zSig0 < 0 ) zExp = 1;
789
            }
790
            return packFloatx80( zSign, zExp, zSig0 );
791
        }
792
    }
793
    if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
794
    if ( increment ) {
795
        ++zSig0;
796
        if ( zSig0 == 0 ) {
797
            ++zExp;
798
            zSig0 = LIT64( 0x8000000000000000 );
799
        }
800
        else {
801
            zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
802
        }
803
    }
804
    else {
805
        if ( zSig0 == 0 ) zExp = 0;
806
    }
807
    return packFloatx80( zSign, zExp, zSig0 );
808

    
809
}
810

    
811
/*----------------------------------------------------------------------------
812
| Takes an abstract floating-point value having sign `zSign', exponent
813
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
814
| and returns the proper extended double-precision floating-point value
815
| corresponding to the abstract input.  This routine is just like
816
| `roundAndPackFloatx80' except that the input significand does not have to be
817
| normalized.
818
*----------------------------------------------------------------------------*/
819

    
820
static floatx80
821
 normalizeRoundAndPackFloatx80(
822
     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
823
 STATUS_PARAM)
824
{
825
    int8 shiftCount;
826

    
827
    if ( zSig0 == 0 ) {
828
        zSig0 = zSig1;
829
        zSig1 = 0;
830
        zExp -= 64;
831
    }
832
    shiftCount = countLeadingZeros64( zSig0 );
833
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
834
    zExp -= shiftCount;
835
    return
836
        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
837

    
838
}
839

    
840
#endif
841

    
842
#ifdef FLOAT128
843

    
844
/*----------------------------------------------------------------------------
845
| Returns the least-significant 64 fraction bits of the quadruple-precision
846
| floating-point value `a'.
847
*----------------------------------------------------------------------------*/
848

    
849
INLINE bits64 extractFloat128Frac1( float128 a )
850
{
851

    
852
    return a.low;
853

    
854
}
855

    
856
/*----------------------------------------------------------------------------
857
| Returns the most-significant 48 fraction bits of the quadruple-precision
858
| floating-point value `a'.
859
*----------------------------------------------------------------------------*/
860

    
861
INLINE bits64 extractFloat128Frac0( float128 a )
862
{
863

    
864
    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
865

    
866
}
867

    
868
/*----------------------------------------------------------------------------
869
| Returns the exponent bits of the quadruple-precision floating-point value
870
| `a'.
871
*----------------------------------------------------------------------------*/
872

    
873
INLINE int32 extractFloat128Exp( float128 a )
874
{
875

    
876
    return ( a.high>>48 ) & 0x7FFF;
877

    
878
}
879

    
880
/*----------------------------------------------------------------------------
881
| Returns the sign bit of the quadruple-precision floating-point value `a'.
882
*----------------------------------------------------------------------------*/
883

    
884
INLINE flag extractFloat128Sign( float128 a )
885
{
886

    
887
    return a.high>>63;
888

    
889
}
890

    
891
/*----------------------------------------------------------------------------
892
| Normalizes the subnormal quadruple-precision floating-point value
893
| represented by the denormalized significand formed by the concatenation of
894
| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
895
| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
896
| significand are stored at the location pointed to by `zSig0Ptr', and the
897
| least significant 64 bits of the normalized significand are stored at the
898
| location pointed to by `zSig1Ptr'.
899
*----------------------------------------------------------------------------*/
900

    
901
static void
902
 normalizeFloat128Subnormal(
903
     bits64 aSig0,
904
     bits64 aSig1,
905
     int32 *zExpPtr,
906
     bits64 *zSig0Ptr,
907
     bits64 *zSig1Ptr
908
 )
909
{
910
    int8 shiftCount;
911

    
912
    if ( aSig0 == 0 ) {
913
        shiftCount = countLeadingZeros64( aSig1 ) - 15;
914
        if ( shiftCount < 0 ) {
915
            *zSig0Ptr = aSig1>>( - shiftCount );
916
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
917
        }
918
        else {
919
            *zSig0Ptr = aSig1<<shiftCount;
920
            *zSig1Ptr = 0;
921
        }
922
        *zExpPtr = - shiftCount - 63;
923
    }
924
    else {
925
        shiftCount = countLeadingZeros64( aSig0 ) - 15;
926
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
927
        *zExpPtr = 1 - shiftCount;
928
    }
929

    
930
}
931

    
932
/*----------------------------------------------------------------------------
933
| Packs the sign `zSign', the exponent `zExp', and the significand formed
934
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
935
| floating-point value, returning the result.  After being shifted into the
936
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
937
| added together to form the most significant 32 bits of the result.  This
938
| means that any integer portion of `zSig0' will be added into the exponent.
939
| Since a properly normalized significand will have an integer portion equal
940
| to 1, the `zExp' input should be 1 less than the desired result exponent
941
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
942
| significand.
943
*----------------------------------------------------------------------------*/
944

    
945
INLINE float128
946
 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
947
{
948
    float128 z;
949

    
950
    z.low = zSig1;
951
    z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
952
    return z;
953

    
954
}
955

    
956
/*----------------------------------------------------------------------------
957
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
958
| and extended significand formed by the concatenation of `zSig0', `zSig1',
959
| and `zSig2', and returns the proper quadruple-precision floating-point value
960
| corresponding to the abstract input.  Ordinarily, the abstract value is
961
| simply rounded and packed into the quadruple-precision format, with the
962
| inexact exception raised if the abstract input cannot be represented
963
| exactly.  However, if the abstract value is too large, the overflow and
964
| inexact exceptions are raised and an infinity or maximal finite value is
965
| returned.  If the abstract value is too small, the input value is rounded to
966
| a subnormal number, and the underflow and inexact exceptions are raised if
967
| the abstract input cannot be represented exactly as a subnormal quadruple-
968
| precision floating-point number.
969
|     The input significand must be normalized or smaller.  If the input
970
| significand is not normalized, `zExp' must be 0; in that case, the result
971
| returned is a subnormal number, and it must not require rounding.  In the
972
| usual case that the input significand is normalized, `zExp' must be 1 less
973
| than the ``true'' floating-point exponent.  The handling of underflow and
974
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
975
*----------------------------------------------------------------------------*/
976

    
977
static float128
978
 roundAndPackFloat128(
979
     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
980
{
981
    int8 roundingMode;
982
    flag roundNearestEven, increment, isTiny;
983

    
984
    roundingMode = STATUS(float_rounding_mode);
985
    roundNearestEven = ( roundingMode == float_round_nearest_even );
986
    increment = ( (sbits64) zSig2 < 0 );
987
    if ( ! roundNearestEven ) {
988
        if ( roundingMode == float_round_to_zero ) {
989
            increment = 0;
990
        }
991
        else {
992
            if ( zSign ) {
993
                increment = ( roundingMode == float_round_down ) && zSig2;
994
            }
995
            else {
996
                increment = ( roundingMode == float_round_up ) && zSig2;
997
            }
998
        }
999
    }
1000
    if ( 0x7FFD <= (bits32) zExp ) {
1001
        if (    ( 0x7FFD < zExp )
1002
             || (    ( zExp == 0x7FFD )
1003
                  && eq128(
1004
                         LIT64( 0x0001FFFFFFFFFFFF ),
1005
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
1006
                         zSig0,
1007
                         zSig1
1008
                     )
1009
                  && increment
1010
                )
1011
           ) {
1012
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1013
            if (    ( roundingMode == float_round_to_zero )
1014
                 || ( zSign && ( roundingMode == float_round_up ) )
1015
                 || ( ! zSign && ( roundingMode == float_round_down ) )
1016
               ) {
1017
                return
1018
                    packFloat128(
1019
                        zSign,
1020
                        0x7FFE,
1021
                        LIT64( 0x0000FFFFFFFFFFFF ),
1022
                        LIT64( 0xFFFFFFFFFFFFFFFF )
1023
                    );
1024
            }
1025
            return packFloat128( zSign, 0x7FFF, 0, 0 );
1026
        }
1027
        if ( zExp < 0 ) {
1028
            if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
1029
            isTiny =
1030
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1031
                || ( zExp < -1 )
1032
                || ! increment
1033
                || lt128(
1034
                       zSig0,
1035
                       zSig1,
1036
                       LIT64( 0x0001FFFFFFFFFFFF ),
1037
                       LIT64( 0xFFFFFFFFFFFFFFFF )
1038
                   );
1039
            shift128ExtraRightJamming(
1040
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1041
            zExp = 0;
1042
            if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1043
            if ( roundNearestEven ) {
1044
                increment = ( (sbits64) zSig2 < 0 );
1045
            }
1046
            else {
1047
                if ( zSign ) {
1048
                    increment = ( roundingMode == float_round_down ) && zSig2;
1049
                }
1050
                else {
1051
                    increment = ( roundingMode == float_round_up ) && zSig2;
1052
                }
1053
            }
1054
        }
1055
    }
1056
    if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1057
    if ( increment ) {
1058
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1059
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1060
    }
1061
    else {
1062
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1063
    }
1064
    return packFloat128( zSign, zExp, zSig0, zSig1 );
1065

    
1066
}
1067

    
1068
/*----------------------------------------------------------------------------
1069
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1070
| and significand formed by the concatenation of `zSig0' and `zSig1', and
1071
| returns the proper quadruple-precision floating-point value corresponding
1072
| to the abstract input.  This routine is just like `roundAndPackFloat128'
1073
| except that the input significand has fewer bits and does not have to be
1074
| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1075
| point exponent.
1076
*----------------------------------------------------------------------------*/
1077

    
1078
static float128
1079
 normalizeRoundAndPackFloat128(
1080
     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1081
{
1082
    int8 shiftCount;
1083
    bits64 zSig2;
1084

    
1085
    if ( zSig0 == 0 ) {
1086
        zSig0 = zSig1;
1087
        zSig1 = 0;
1088
        zExp -= 64;
1089
    }
1090
    shiftCount = countLeadingZeros64( zSig0 ) - 15;
1091
    if ( 0 <= shiftCount ) {
1092
        zSig2 = 0;
1093
        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1094
    }
1095
    else {
1096
        shift128ExtraRightJamming(
1097
            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1098
    }
1099
    zExp -= shiftCount;
1100
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1101

    
1102
}
1103

    
1104
#endif
1105

    
1106
/*----------------------------------------------------------------------------
1107
| Returns the result of converting the 32-bit two's complement integer `a'
1108
| to the single-precision floating-point format.  The conversion is performed
1109
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1110
*----------------------------------------------------------------------------*/
1111

    
1112
float32 int32_to_float32( int32 a STATUS_PARAM )
1113
{
1114
    flag zSign;
1115

    
1116
    if ( a == 0 ) return float32_zero;
1117
    if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1118
    zSign = ( a < 0 );
1119
    return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1120

    
1121
}
1122

    
1123
/*----------------------------------------------------------------------------
1124
| Returns the result of converting the 32-bit two's complement integer `a'
1125
| to the double-precision floating-point format.  The conversion is performed
1126
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1127
*----------------------------------------------------------------------------*/
1128

    
1129
float64 int32_to_float64( int32 a STATUS_PARAM )
1130
{
1131
    flag zSign;
1132
    uint32 absA;
1133
    int8 shiftCount;
1134
    bits64 zSig;
1135

    
1136
    if ( a == 0 ) return float64_zero;
1137
    zSign = ( a < 0 );
1138
    absA = zSign ? - a : a;
1139
    shiftCount = countLeadingZeros32( absA ) + 21;
1140
    zSig = absA;
1141
    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1142

    
1143
}
1144

    
1145
#ifdef FLOATX80
1146

    
1147
/*----------------------------------------------------------------------------
1148
| Returns the result of converting the 32-bit two's complement integer `a'
1149
| to the extended double-precision floating-point format.  The conversion
1150
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1151
| Arithmetic.
1152
*----------------------------------------------------------------------------*/
1153

    
1154
floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1155
{
1156
    flag zSign;
1157
    uint32 absA;
1158
    int8 shiftCount;
1159
    bits64 zSig;
1160

    
1161
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1162
    zSign = ( a < 0 );
1163
    absA = zSign ? - a : a;
1164
    shiftCount = countLeadingZeros32( absA ) + 32;
1165
    zSig = absA;
1166
    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1167

    
1168
}
1169

    
1170
#endif
1171

    
1172
#ifdef FLOAT128
1173

    
1174
/*----------------------------------------------------------------------------
1175
| Returns the result of converting the 32-bit two's complement integer `a' to
1176
| the quadruple-precision floating-point format.  The conversion is performed
1177
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1178
*----------------------------------------------------------------------------*/
1179

    
1180
float128 int32_to_float128( int32 a STATUS_PARAM )
1181
{
1182
    flag zSign;
1183
    uint32 absA;
1184
    int8 shiftCount;
1185
    bits64 zSig0;
1186

    
1187
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1188
    zSign = ( a < 0 );
1189
    absA = zSign ? - a : a;
1190
    shiftCount = countLeadingZeros32( absA ) + 17;
1191
    zSig0 = absA;
1192
    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1193

    
1194
}
1195

    
1196
#endif
1197

    
1198
/*----------------------------------------------------------------------------
1199
| Returns the result of converting the 64-bit two's complement integer `a'
1200
| to the single-precision floating-point format.  The conversion is performed
1201
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1202
*----------------------------------------------------------------------------*/
1203

    
1204
float32 int64_to_float32( int64 a STATUS_PARAM )
1205
{
1206
    flag zSign;
1207
    uint64 absA;
1208
    int8 shiftCount;
1209

    
1210
    if ( a == 0 ) return float32_zero;
1211
    zSign = ( a < 0 );
1212
    absA = zSign ? - a : a;
1213
    shiftCount = countLeadingZeros64( absA ) - 40;
1214
    if ( 0 <= shiftCount ) {
1215
        return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1216
    }
1217
    else {
1218
        shiftCount += 7;
1219
        if ( shiftCount < 0 ) {
1220
            shift64RightJamming( absA, - shiftCount, &absA );
1221
        }
1222
        else {
1223
            absA <<= shiftCount;
1224
        }
1225
        return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1226
    }
1227

    
1228
}
1229

    
1230
float32 uint64_to_float32( uint64 a STATUS_PARAM )
1231
{
1232
    int8 shiftCount;
1233

    
1234
    if ( a == 0 ) return float32_zero;
1235
    shiftCount = countLeadingZeros64( a ) - 40;
1236
    if ( 0 <= shiftCount ) {
1237
        return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1238
    }
1239
    else {
1240
        shiftCount += 7;
1241
        if ( shiftCount < 0 ) {
1242
            shift64RightJamming( a, - shiftCount, &a );
1243
        }
1244
        else {
1245
            a <<= shiftCount;
1246
        }
1247
        return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1248
    }
1249
}
1250

    
1251
/*----------------------------------------------------------------------------
1252
| Returns the result of converting the 64-bit two's complement integer `a'
1253
| to the double-precision floating-point format.  The conversion is performed
1254
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1255
*----------------------------------------------------------------------------*/
1256

    
1257
float64 int64_to_float64( int64 a STATUS_PARAM )
1258
{
1259
    flag zSign;
1260

    
1261
    if ( a == 0 ) return float64_zero;
1262
    if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1263
        return packFloat64( 1, 0x43E, 0 );
1264
    }
1265
    zSign = ( a < 0 );
1266
    return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1267

    
1268
}
1269

    
1270
float64 uint64_to_float64( uint64 a STATUS_PARAM )
1271
{
1272
    if ( a == 0 ) return float64_zero;
1273
    return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1274

    
1275
}
1276

    
1277
#ifdef FLOATX80
1278

    
1279
/*----------------------------------------------------------------------------
1280
| Returns the result of converting the 64-bit two's complement integer `a'
1281
| to the extended double-precision floating-point format.  The conversion
1282
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1283
| Arithmetic.
1284
*----------------------------------------------------------------------------*/
1285

    
1286
floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1287
{
1288
    flag zSign;
1289
    uint64 absA;
1290
    int8 shiftCount;
1291

    
1292
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1293
    zSign = ( a < 0 );
1294
    absA = zSign ? - a : a;
1295
    shiftCount = countLeadingZeros64( absA );
1296
    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1297

    
1298
}
1299

    
1300
#endif
1301

    
1302
#ifdef FLOAT128
1303

    
1304
/*----------------------------------------------------------------------------
1305
| Returns the result of converting the 64-bit two's complement integer `a' to
1306
| the quadruple-precision floating-point format.  The conversion is performed
1307
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1308
*----------------------------------------------------------------------------*/
1309

    
1310
float128 int64_to_float128( int64 a STATUS_PARAM )
1311
{
1312
    flag zSign;
1313
    uint64 absA;
1314
    int8 shiftCount;
1315
    int32 zExp;
1316
    bits64 zSig0, zSig1;
1317

    
1318
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1319
    zSign = ( a < 0 );
1320
    absA = zSign ? - a : a;
1321
    shiftCount = countLeadingZeros64( absA ) + 49;
1322
    zExp = 0x406E - shiftCount;
1323
    if ( 64 <= shiftCount ) {
1324
        zSig1 = 0;
1325
        zSig0 = absA;
1326
        shiftCount -= 64;
1327
    }
1328
    else {
1329
        zSig1 = absA;
1330
        zSig0 = 0;
1331
    }
1332
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1333
    return packFloat128( zSign, zExp, zSig0, zSig1 );
1334

    
1335
}
1336

    
1337
#endif
1338

    
1339
/*----------------------------------------------------------------------------
1340
| Returns the result of converting the single-precision floating-point value
1341
| `a' to the 32-bit two's complement integer format.  The conversion is
1342
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1343
| Arithmetic---which means in particular that the conversion is rounded
1344
| according to the current rounding mode.  If `a' is a NaN, the largest
1345
| positive integer is returned.  Otherwise, if the conversion overflows, the
1346
| largest integer with the same sign as `a' is returned.
1347
*----------------------------------------------------------------------------*/
1348

    
1349
int32 float32_to_int32( float32 a STATUS_PARAM )
1350
{
1351
    flag aSign;
1352
    int16 aExp, shiftCount;
1353
    bits32 aSig;
1354
    bits64 aSig64;
1355

    
1356
    a = float32_squash_input_denormal(a STATUS_VAR);
1357
    aSig = extractFloat32Frac( a );
1358
    aExp = extractFloat32Exp( a );
1359
    aSign = extractFloat32Sign( a );
1360
    if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1361
    if ( aExp ) aSig |= 0x00800000;
1362
    shiftCount = 0xAF - aExp;
1363
    aSig64 = aSig;
1364
    aSig64 <<= 32;
1365
    if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1366
    return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1367

    
1368
}
1369

    
1370
/*----------------------------------------------------------------------------
1371
| Returns the result of converting the single-precision floating-point value
1372
| `a' to the 32-bit two's complement integer format.  The conversion is
1373
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1374
| Arithmetic, except that the conversion is always rounded toward zero.
1375
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1376
| the conversion overflows, the largest integer with the same sign as `a' is
1377
| returned.
1378
*----------------------------------------------------------------------------*/
1379

    
1380
int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1381
{
1382
    flag aSign;
1383
    int16 aExp, shiftCount;
1384
    bits32 aSig;
1385
    int32 z;
1386
    a = float32_squash_input_denormal(a STATUS_VAR);
1387

    
1388
    aSig = extractFloat32Frac( a );
1389
    aExp = extractFloat32Exp( a );
1390
    aSign = extractFloat32Sign( a );
1391
    shiftCount = aExp - 0x9E;
1392
    if ( 0 <= shiftCount ) {
1393
        if ( float32_val(a) != 0xCF000000 ) {
1394
            float_raise( float_flag_invalid STATUS_VAR);
1395
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1396
        }
1397
        return (sbits32) 0x80000000;
1398
    }
1399
    else if ( aExp <= 0x7E ) {
1400
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1401
        return 0;
1402
    }
1403
    aSig = ( aSig | 0x00800000 )<<8;
1404
    z = aSig>>( - shiftCount );
1405
    if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1406
        STATUS(float_exception_flags) |= float_flag_inexact;
1407
    }
1408
    if ( aSign ) z = - z;
1409
    return z;
1410

    
1411
}
1412

    
1413
/*----------------------------------------------------------------------------
1414
| Returns the result of converting the single-precision floating-point value
1415
| `a' to the 16-bit two's complement integer format.  The conversion is
1416
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1417
| Arithmetic, except that the conversion is always rounded toward zero.
1418
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1419
| the conversion overflows, the largest integer with the same sign as `a' is
1420
| returned.
1421
*----------------------------------------------------------------------------*/
1422

    
1423
int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1424
{
1425
    flag aSign;
1426
    int16 aExp, shiftCount;
1427
    bits32 aSig;
1428
    int32 z;
1429

    
1430
    aSig = extractFloat32Frac( a );
1431
    aExp = extractFloat32Exp( a );
1432
    aSign = extractFloat32Sign( a );
1433
    shiftCount = aExp - 0x8E;
1434
    if ( 0 <= shiftCount ) {
1435
        if ( float32_val(a) != 0xC7000000 ) {
1436
            float_raise( float_flag_invalid STATUS_VAR);
1437
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1438
                return 0x7FFF;
1439
            }
1440
        }
1441
        return (sbits32) 0xffff8000;
1442
    }
1443
    else if ( aExp <= 0x7E ) {
1444
        if ( aExp | aSig ) {
1445
            STATUS(float_exception_flags) |= float_flag_inexact;
1446
        }
1447
        return 0;
1448
    }
1449
    shiftCount -= 0x10;
1450
    aSig = ( aSig | 0x00800000 )<<8;
1451
    z = aSig>>( - shiftCount );
1452
    if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1453
        STATUS(float_exception_flags) |= float_flag_inexact;
1454
    }
1455
    if ( aSign ) {
1456
        z = - z;
1457
    }
1458
    return z;
1459

    
1460
}
1461

    
1462
/*----------------------------------------------------------------------------
1463
| Returns the result of converting the single-precision floating-point value
1464
| `a' to the 64-bit two's complement integer format.  The conversion is
1465
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1466
| Arithmetic---which means in particular that the conversion is rounded
1467
| according to the current rounding mode.  If `a' is a NaN, the largest
1468
| positive integer is returned.  Otherwise, if the conversion overflows, the
1469
| largest integer with the same sign as `a' is returned.
1470
*----------------------------------------------------------------------------*/
1471

    
1472
int64 float32_to_int64( float32 a STATUS_PARAM )
1473
{
1474
    flag aSign;
1475
    int16 aExp, shiftCount;
1476
    bits32 aSig;
1477
    bits64 aSig64, aSigExtra;
1478
    a = float32_squash_input_denormal(a STATUS_VAR);
1479

    
1480
    aSig = extractFloat32Frac( a );
1481
    aExp = extractFloat32Exp( a );
1482
    aSign = extractFloat32Sign( a );
1483
    shiftCount = 0xBE - aExp;
1484
    if ( shiftCount < 0 ) {
1485
        float_raise( float_flag_invalid STATUS_VAR);
1486
        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1487
            return LIT64( 0x7FFFFFFFFFFFFFFF );
1488
        }
1489
        return (sbits64) LIT64( 0x8000000000000000 );
1490
    }
1491
    if ( aExp ) aSig |= 0x00800000;
1492
    aSig64 = aSig;
1493
    aSig64 <<= 40;
1494
    shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1495
    return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1496

    
1497
}
1498

    
1499
/*----------------------------------------------------------------------------
1500
| Returns the result of converting the single-precision floating-point value
1501
| `a' to the 64-bit two's complement integer format.  The conversion is
1502
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1503
| Arithmetic, except that the conversion is always rounded toward zero.  If
1504
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1505
| conversion overflows, the largest integer with the same sign as `a' is
1506
| returned.
1507
*----------------------------------------------------------------------------*/
1508

    
1509
int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1510
{
1511
    flag aSign;
1512
    int16 aExp, shiftCount;
1513
    bits32 aSig;
1514
    bits64 aSig64;
1515
    int64 z;
1516
    a = float32_squash_input_denormal(a STATUS_VAR);
1517

    
1518
    aSig = extractFloat32Frac( a );
1519
    aExp = extractFloat32Exp( a );
1520
    aSign = extractFloat32Sign( a );
1521
    shiftCount = aExp - 0xBE;
1522
    if ( 0 <= shiftCount ) {
1523
        if ( float32_val(a) != 0xDF000000 ) {
1524
            float_raise( float_flag_invalid STATUS_VAR);
1525
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1526
                return LIT64( 0x7FFFFFFFFFFFFFFF );
1527
            }
1528
        }
1529
        return (sbits64) LIT64( 0x8000000000000000 );
1530
    }
1531
    else if ( aExp <= 0x7E ) {
1532
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1533
        return 0;
1534
    }
1535
    aSig64 = aSig | 0x00800000;
1536
    aSig64 <<= 40;
1537
    z = aSig64>>( - shiftCount );
1538
    if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1539
        STATUS(float_exception_flags) |= float_flag_inexact;
1540
    }
1541
    if ( aSign ) z = - z;
1542
    return z;
1543

    
1544
}
1545

    
1546
/*----------------------------------------------------------------------------
1547
| Returns the result of converting the single-precision floating-point value
1548
| `a' to the double-precision floating-point format.  The conversion is
1549
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1550
| Arithmetic.
1551
*----------------------------------------------------------------------------*/
1552

    
1553
float64 float32_to_float64( float32 a STATUS_PARAM )
1554
{
1555
    flag aSign;
1556
    int16 aExp;
1557
    bits32 aSig;
1558
    a = float32_squash_input_denormal(a STATUS_VAR);
1559

    
1560
    aSig = extractFloat32Frac( a );
1561
    aExp = extractFloat32Exp( a );
1562
    aSign = extractFloat32Sign( a );
1563
    if ( aExp == 0xFF ) {
1564
        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1565
        return packFloat64( aSign, 0x7FF, 0 );
1566
    }
1567
    if ( aExp == 0 ) {
1568
        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1569
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1570
        --aExp;
1571
    }
1572
    return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1573

    
1574
}
1575

    
1576
#ifdef FLOATX80
1577

    
1578
/*----------------------------------------------------------------------------
1579
| Returns the result of converting the single-precision floating-point value
1580
| `a' to the extended double-precision floating-point format.  The conversion
1581
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1582
| Arithmetic.
1583
*----------------------------------------------------------------------------*/
1584

    
1585
floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1586
{
1587
    flag aSign;
1588
    int16 aExp;
1589
    bits32 aSig;
1590

    
1591
    a = float32_squash_input_denormal(a STATUS_VAR);
1592
    aSig = extractFloat32Frac( a );
1593
    aExp = extractFloat32Exp( a );
1594
    aSign = extractFloat32Sign( a );
1595
    if ( aExp == 0xFF ) {
1596
        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1597
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1598
    }
1599
    if ( aExp == 0 ) {
1600
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1601
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1602
    }
1603
    aSig |= 0x00800000;
1604
    return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1605

    
1606
}
1607

    
1608
#endif
1609

    
1610
#ifdef FLOAT128
1611

    
1612
/*----------------------------------------------------------------------------
1613
| Returns the result of converting the single-precision floating-point value
1614
| `a' to the double-precision floating-point format.  The conversion is
1615
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1616
| Arithmetic.
1617
*----------------------------------------------------------------------------*/
1618

    
1619
float128 float32_to_float128( float32 a STATUS_PARAM )
1620
{
1621
    flag aSign;
1622
    int16 aExp;
1623
    bits32 aSig;
1624

    
1625
    a = float32_squash_input_denormal(a STATUS_VAR);
1626
    aSig = extractFloat32Frac( a );
1627
    aExp = extractFloat32Exp( a );
1628
    aSign = extractFloat32Sign( a );
1629
    if ( aExp == 0xFF ) {
1630
        if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1631
        return packFloat128( aSign, 0x7FFF, 0, 0 );
1632
    }
1633
    if ( aExp == 0 ) {
1634
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1635
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1636
        --aExp;
1637
    }
1638
    return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1639

    
1640
}
1641

    
1642
#endif
1643

    
1644
/*----------------------------------------------------------------------------
1645
| Rounds the single-precision floating-point value `a' to an integer, and
1646
| returns the result as a single-precision floating-point value.  The
1647
| operation is performed according to the IEC/IEEE Standard for Binary
1648
| Floating-Point Arithmetic.
1649
*----------------------------------------------------------------------------*/
1650

    
1651
float32 float32_round_to_int( float32 a STATUS_PARAM)
1652
{
1653
    flag aSign;
1654
    int16 aExp;
1655
    bits32 lastBitMask, roundBitsMask;
1656
    int8 roundingMode;
1657
    bits32 z;
1658
    a = float32_squash_input_denormal(a STATUS_VAR);
1659

    
1660
    aExp = extractFloat32Exp( a );
1661
    if ( 0x96 <= aExp ) {
1662
        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1663
            return propagateFloat32NaN( a, a STATUS_VAR );
1664
        }
1665
        return a;
1666
    }
1667
    if ( aExp <= 0x7E ) {
1668
        if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
1669
        STATUS(float_exception_flags) |= float_flag_inexact;
1670
        aSign = extractFloat32Sign( a );
1671
        switch ( STATUS(float_rounding_mode) ) {
1672
         case float_round_nearest_even:
1673
            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1674
                return packFloat32( aSign, 0x7F, 0 );
1675
            }
1676
            break;
1677
         case float_round_down:
1678
            return make_float32(aSign ? 0xBF800000 : 0);
1679
         case float_round_up:
1680
            return make_float32(aSign ? 0x80000000 : 0x3F800000);
1681
        }
1682
        return packFloat32( aSign, 0, 0 );
1683
    }
1684
    lastBitMask = 1;
1685
    lastBitMask <<= 0x96 - aExp;
1686
    roundBitsMask = lastBitMask - 1;
1687
    z = float32_val(a);
1688
    roundingMode = STATUS(float_rounding_mode);
1689
    if ( roundingMode == float_round_nearest_even ) {
1690
        z += lastBitMask>>1;
1691
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1692
    }
1693
    else if ( roundingMode != float_round_to_zero ) {
1694
        if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1695
            z += roundBitsMask;
1696
        }
1697
    }
1698
    z &= ~ roundBitsMask;
1699
    if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1700
    return make_float32(z);
1701

    
1702
}
1703

    
1704
/*----------------------------------------------------------------------------
1705
| Returns the result of adding the absolute values of the single-precision
1706
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1707
| before being returned.  `zSign' is ignored if the result is a NaN.
1708
| The addition is performed according to the IEC/IEEE Standard for Binary
1709
| Floating-Point Arithmetic.
1710
*----------------------------------------------------------------------------*/
1711

    
1712
static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1713
{
1714
    int16 aExp, bExp, zExp;
1715
    bits32 aSig, bSig, zSig;
1716
    int16 expDiff;
1717

    
1718
    aSig = extractFloat32Frac( a );
1719
    aExp = extractFloat32Exp( a );
1720
    bSig = extractFloat32Frac( b );
1721
    bExp = extractFloat32Exp( b );
1722
    expDiff = aExp - bExp;
1723
    aSig <<= 6;
1724
    bSig <<= 6;
1725
    if ( 0 < expDiff ) {
1726
        if ( aExp == 0xFF ) {
1727
            if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1728
            return a;
1729
        }
1730
        if ( bExp == 0 ) {
1731
            --expDiff;
1732
        }
1733
        else {
1734
            bSig |= 0x20000000;
1735
        }
1736
        shift32RightJamming( bSig, expDiff, &bSig );
1737
        zExp = aExp;
1738
    }
1739
    else if ( expDiff < 0 ) {
1740
        if ( bExp == 0xFF ) {
1741
            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1742
            return packFloat32( zSign, 0xFF, 0 );
1743
        }
1744
        if ( aExp == 0 ) {
1745
            ++expDiff;
1746
        }
1747
        else {
1748
            aSig |= 0x20000000;
1749
        }
1750
        shift32RightJamming( aSig, - expDiff, &aSig );
1751
        zExp = bExp;
1752
    }
1753
    else {
1754
        if ( aExp == 0xFF ) {
1755
            if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1756
            return a;
1757
        }
1758
        if ( aExp == 0 ) {
1759
            if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1760
            return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1761
        }
1762
        zSig = 0x40000000 + aSig + bSig;
1763
        zExp = aExp;
1764
        goto roundAndPack;
1765
    }
1766
    aSig |= 0x20000000;
1767
    zSig = ( aSig + bSig )<<1;
1768
    --zExp;
1769
    if ( (sbits32) zSig < 0 ) {
1770
        zSig = aSig + bSig;
1771
        ++zExp;
1772
    }
1773
 roundAndPack:
1774
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1775

    
1776
}
1777

    
1778
/*----------------------------------------------------------------------------
1779
| Returns the result of subtracting the absolute values of the single-
1780
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
1781
| difference is negated before being returned.  `zSign' is ignored if the
1782
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
1783
| Standard for Binary Floating-Point Arithmetic.
1784
*----------------------------------------------------------------------------*/
1785

    
1786
static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1787
{
1788
    int16 aExp, bExp, zExp;
1789
    bits32 aSig, bSig, zSig;
1790
    int16 expDiff;
1791

    
1792
    aSig = extractFloat32Frac( a );
1793
    aExp = extractFloat32Exp( a );
1794
    bSig = extractFloat32Frac( b );
1795
    bExp = extractFloat32Exp( b );
1796
    expDiff = aExp - bExp;
1797
    aSig <<= 7;
1798
    bSig <<= 7;
1799
    if ( 0 < expDiff ) goto aExpBigger;
1800
    if ( expDiff < 0 ) goto bExpBigger;
1801
    if ( aExp == 0xFF ) {
1802
        if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1803
        float_raise( float_flag_invalid STATUS_VAR);
1804
        return float32_default_nan;
1805
    }
1806
    if ( aExp == 0 ) {
1807
        aExp = 1;
1808
        bExp = 1;
1809
    }
1810
    if ( bSig < aSig ) goto aBigger;
1811
    if ( aSig < bSig ) goto bBigger;
1812
    return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1813
 bExpBigger:
1814
    if ( bExp == 0xFF ) {
1815
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1816
        return packFloat32( zSign ^ 1, 0xFF, 0 );
1817
    }
1818
    if ( aExp == 0 ) {
1819
        ++expDiff;
1820
    }
1821
    else {
1822
        aSig |= 0x40000000;
1823
    }
1824
    shift32RightJamming( aSig, - expDiff, &aSig );
1825
    bSig |= 0x40000000;
1826
 bBigger:
1827
    zSig = bSig - aSig;
1828
    zExp = bExp;
1829
    zSign ^= 1;
1830
    goto normalizeRoundAndPack;
1831
 aExpBigger:
1832
    if ( aExp == 0xFF ) {
1833
        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1834
        return a;
1835
    }
1836
    if ( bExp == 0 ) {
1837
        --expDiff;
1838
    }
1839
    else {
1840
        bSig |= 0x40000000;
1841
    }
1842
    shift32RightJamming( bSig, expDiff, &bSig );
1843
    aSig |= 0x40000000;
1844
 aBigger:
1845
    zSig = aSig - bSig;
1846
    zExp = aExp;
1847
 normalizeRoundAndPack:
1848
    --zExp;
1849
    return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1850

    
1851
}
1852

    
1853
/*----------------------------------------------------------------------------
1854
| Returns the result of adding the single-precision floating-point values `a'
1855
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
1856
| Binary Floating-Point Arithmetic.
1857
*----------------------------------------------------------------------------*/
1858

    
1859
float32 float32_add( float32 a, float32 b STATUS_PARAM )
1860
{
1861
    flag aSign, bSign;
1862
    a = float32_squash_input_denormal(a STATUS_VAR);
1863
    b = float32_squash_input_denormal(b STATUS_VAR);
1864

    
1865
    aSign = extractFloat32Sign( a );
1866
    bSign = extractFloat32Sign( b );
1867
    if ( aSign == bSign ) {
1868
        return addFloat32Sigs( a, b, aSign STATUS_VAR);
1869
    }
1870
    else {
1871
        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1872
    }
1873

    
1874
}
1875

    
1876
/*----------------------------------------------------------------------------
1877
| Returns the result of subtracting the single-precision floating-point values
1878
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1879
| for Binary Floating-Point Arithmetic.
1880
*----------------------------------------------------------------------------*/
1881

    
1882
float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1883
{
1884
    flag aSign, bSign;
1885
    a = float32_squash_input_denormal(a STATUS_VAR);
1886
    b = float32_squash_input_denormal(b STATUS_VAR);
1887

    
1888
    aSign = extractFloat32Sign( a );
1889
    bSign = extractFloat32Sign( b );
1890
    if ( aSign == bSign ) {
1891
        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1892
    }
1893
    else {
1894
        return addFloat32Sigs( a, b, aSign STATUS_VAR );
1895
    }
1896

    
1897
}
1898

    
1899
/*----------------------------------------------------------------------------
1900
| Returns the result of multiplying the single-precision floating-point values
1901
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1902
| for Binary Floating-Point Arithmetic.
1903
*----------------------------------------------------------------------------*/
1904

    
1905
float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1906
{
1907
    flag aSign, bSign, zSign;
1908
    int16 aExp, bExp, zExp;
1909
    bits32 aSig, bSig;
1910
    bits64 zSig64;
1911
    bits32 zSig;
1912

    
1913
    a = float32_squash_input_denormal(a STATUS_VAR);
1914
    b = float32_squash_input_denormal(b STATUS_VAR);
1915

    
1916
    aSig = extractFloat32Frac( a );
1917
    aExp = extractFloat32Exp( a );
1918
    aSign = extractFloat32Sign( a );
1919
    bSig = extractFloat32Frac( b );
1920
    bExp = extractFloat32Exp( b );
1921
    bSign = extractFloat32Sign( b );
1922
    zSign = aSign ^ bSign;
1923
    if ( aExp == 0xFF ) {
1924
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1925
            return propagateFloat32NaN( a, b STATUS_VAR );
1926
        }
1927
        if ( ( bExp | bSig ) == 0 ) {
1928
            float_raise( float_flag_invalid STATUS_VAR);
1929
            return float32_default_nan;
1930
        }
1931
        return packFloat32( zSign, 0xFF, 0 );
1932
    }
1933
    if ( bExp == 0xFF ) {
1934
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1935
        if ( ( aExp | aSig ) == 0 ) {
1936
            float_raise( float_flag_invalid STATUS_VAR);
1937
            return float32_default_nan;
1938
        }
1939
        return packFloat32( zSign, 0xFF, 0 );
1940
    }
1941
    if ( aExp == 0 ) {
1942
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1943
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1944
    }
1945
    if ( bExp == 0 ) {
1946
        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1947
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1948
    }
1949
    zExp = aExp + bExp - 0x7F;
1950
    aSig = ( aSig | 0x00800000 )<<7;
1951
    bSig = ( bSig | 0x00800000 )<<8;
1952
    shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1953
    zSig = zSig64;
1954
    if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1955
        zSig <<= 1;
1956
        --zExp;
1957
    }
1958
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1959

    
1960
}
1961

    
1962
/*----------------------------------------------------------------------------
1963
| Returns the result of dividing the single-precision floating-point value `a'
1964
| by the corresponding value `b'.  The operation is performed according to the
1965
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1966
*----------------------------------------------------------------------------*/
1967

    
1968
float32 float32_div( float32 a, float32 b STATUS_PARAM )
1969
{
1970
    flag aSign, bSign, zSign;
1971
    int16 aExp, bExp, zExp;
1972
    bits32 aSig, bSig, zSig;
1973
    a = float32_squash_input_denormal(a STATUS_VAR);
1974
    b = float32_squash_input_denormal(b STATUS_VAR);
1975

    
1976
    aSig = extractFloat32Frac( a );
1977
    aExp = extractFloat32Exp( a );
1978
    aSign = extractFloat32Sign( a );
1979
    bSig = extractFloat32Frac( b );
1980
    bExp = extractFloat32Exp( b );
1981
    bSign = extractFloat32Sign( b );
1982
    zSign = aSign ^ bSign;
1983
    if ( aExp == 0xFF ) {
1984
        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1985
        if ( bExp == 0xFF ) {
1986
            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1987
            float_raise( float_flag_invalid STATUS_VAR);
1988
            return float32_default_nan;
1989
        }
1990
        return packFloat32( zSign, 0xFF, 0 );
1991
    }
1992
    if ( bExp == 0xFF ) {
1993
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1994
        return packFloat32( zSign, 0, 0 );
1995
    }
1996
    if ( bExp == 0 ) {
1997
        if ( bSig == 0 ) {
1998
            if ( ( aExp | aSig ) == 0 ) {
1999
                float_raise( float_flag_invalid STATUS_VAR);
2000
                return float32_default_nan;
2001
            }
2002
            float_raise( float_flag_divbyzero STATUS_VAR);
2003
            return packFloat32( zSign, 0xFF, 0 );
2004
        }
2005
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2006
    }
2007
    if ( aExp == 0 ) {
2008
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2009
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2010
    }
2011
    zExp = aExp - bExp + 0x7D;
2012
    aSig = ( aSig | 0x00800000 )<<7;
2013
    bSig = ( bSig | 0x00800000 )<<8;
2014
    if ( bSig <= ( aSig + aSig ) ) {
2015
        aSig >>= 1;
2016
        ++zExp;
2017
    }
2018
    zSig = ( ( (bits64) aSig )<<32 ) / bSig;
2019
    if ( ( zSig & 0x3F ) == 0 ) {
2020
        zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2021
    }
2022
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2023

    
2024
}
2025

    
2026
/*----------------------------------------------------------------------------
2027
| Returns the remainder of the single-precision floating-point value `a'
2028
| with respect to the corresponding value `b'.  The operation is performed
2029
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2030
*----------------------------------------------------------------------------*/
2031

    
2032
float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2033
{
2034
    flag aSign, zSign;
2035
    int16 aExp, bExp, expDiff;
2036
    bits32 aSig, bSig;
2037
    bits32 q;
2038
    bits64 aSig64, bSig64, q64;
2039
    bits32 alternateASig;
2040
    sbits32 sigMean;
2041
    a = float32_squash_input_denormal(a STATUS_VAR);
2042
    b = float32_squash_input_denormal(b STATUS_VAR);
2043

    
2044
    aSig = extractFloat32Frac( a );
2045
    aExp = extractFloat32Exp( a );
2046
    aSign = extractFloat32Sign( a );
2047
    bSig = extractFloat32Frac( b );
2048
    bExp = extractFloat32Exp( b );
2049
    if ( aExp == 0xFF ) {
2050
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2051
            return propagateFloat32NaN( a, b STATUS_VAR );
2052
        }
2053
        float_raise( float_flag_invalid STATUS_VAR);
2054
        return float32_default_nan;
2055
    }
2056
    if ( bExp == 0xFF ) {
2057
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2058
        return a;
2059
    }
2060
    if ( bExp == 0 ) {
2061
        if ( bSig == 0 ) {
2062
            float_raise( float_flag_invalid STATUS_VAR);
2063
            return float32_default_nan;
2064
        }
2065
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2066
    }
2067
    if ( aExp == 0 ) {
2068
        if ( aSig == 0 ) return a;
2069
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2070
    }
2071
    expDiff = aExp - bExp;
2072
    aSig |= 0x00800000;
2073
    bSig |= 0x00800000;
2074
    if ( expDiff < 32 ) {
2075
        aSig <<= 8;
2076
        bSig <<= 8;
2077
        if ( expDiff < 0 ) {
2078
            if ( expDiff < -1 ) return a;
2079
            aSig >>= 1;
2080
        }
2081
        q = ( bSig <= aSig );
2082
        if ( q ) aSig -= bSig;
2083
        if ( 0 < expDiff ) {
2084
            q = ( ( (bits64) aSig )<<32 ) / bSig;
2085
            q >>= 32 - expDiff;
2086
            bSig >>= 2;
2087
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2088
        }
2089
        else {
2090
            aSig >>= 2;
2091
            bSig >>= 2;
2092
        }
2093
    }
2094
    else {
2095
        if ( bSig <= aSig ) aSig -= bSig;
2096
        aSig64 = ( (bits64) aSig )<<40;
2097
        bSig64 = ( (bits64) bSig )<<40;
2098
        expDiff -= 64;
2099
        while ( 0 < expDiff ) {
2100
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2101
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2102
            aSig64 = - ( ( bSig * q64 )<<38 );
2103
            expDiff -= 62;
2104
        }
2105
        expDiff += 64;
2106
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2107
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2108
        q = q64>>( 64 - expDiff );
2109
        bSig <<= 6;
2110
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2111
    }
2112
    do {
2113
        alternateASig = aSig;
2114
        ++q;
2115
        aSig -= bSig;
2116
    } while ( 0 <= (sbits32) aSig );
2117
    sigMean = aSig + alternateASig;
2118
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2119
        aSig = alternateASig;
2120
    }
2121
    zSign = ( (sbits32) aSig < 0 );
2122
    if ( zSign ) aSig = - aSig;
2123
    return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2124

    
2125
}
2126

    
2127
/*----------------------------------------------------------------------------
2128
| Returns the square root of the single-precision floating-point value `a'.
2129
| The operation is performed according to the IEC/IEEE Standard for Binary
2130
| Floating-Point Arithmetic.
2131
*----------------------------------------------------------------------------*/
2132

    
2133
float32 float32_sqrt( float32 a STATUS_PARAM )
2134
{
2135
    flag aSign;
2136
    int16 aExp, zExp;
2137
    bits32 aSig, zSig;
2138
    bits64 rem, term;
2139
    a = float32_squash_input_denormal(a STATUS_VAR);
2140

    
2141
    aSig = extractFloat32Frac( a );
2142
    aExp = extractFloat32Exp( a );
2143
    aSign = extractFloat32Sign( a );
2144
    if ( aExp == 0xFF ) {
2145
        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2146
        if ( ! aSign ) return a;
2147
        float_raise( float_flag_invalid STATUS_VAR);
2148
        return float32_default_nan;
2149
    }
2150
    if ( aSign ) {
2151
        if ( ( aExp | aSig ) == 0 ) return a;
2152
        float_raise( float_flag_invalid STATUS_VAR);
2153
        return float32_default_nan;
2154
    }
2155
    if ( aExp == 0 ) {
2156
        if ( aSig == 0 ) return float32_zero;
2157
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2158
    }
2159
    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2160
    aSig = ( aSig | 0x00800000 )<<8;
2161
    zSig = estimateSqrt32( aExp, aSig ) + 2;
2162
    if ( ( zSig & 0x7F ) <= 5 ) {
2163
        if ( zSig < 2 ) {
2164
            zSig = 0x7FFFFFFF;
2165
            goto roundAndPack;
2166
        }
2167
        aSig >>= aExp & 1;
2168
        term = ( (bits64) zSig ) * zSig;
2169
        rem = ( ( (bits64) aSig )<<32 ) - term;
2170
        while ( (sbits64) rem < 0 ) {
2171
            --zSig;
2172
            rem += ( ( (bits64) zSig )<<1 ) | 1;
2173
        }
2174
        zSig |= ( rem != 0 );
2175
    }
2176
    shift32RightJamming( zSig, 1, &zSig );
2177
 roundAndPack:
2178
    return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2179

    
2180
}
2181

    
2182
/*----------------------------------------------------------------------------
2183
| Returns the binary exponential of the single-precision floating-point value
2184
| `a'. The operation is performed according to the IEC/IEEE Standard for
2185
| Binary Floating-Point Arithmetic.
2186
|
2187
| Uses the following identities:
2188
|
2189
| 1. -------------------------------------------------------------------------
2190
|      x    x*ln(2)
2191
|     2  = e
2192
|
2193
| 2. -------------------------------------------------------------------------
2194
|                      2     3     4     5           n
2195
|      x        x     x     x     x     x           x
2196
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2197
|               1!    2!    3!    4!    5!          n!
2198
*----------------------------------------------------------------------------*/
2199

    
2200
static const float64 float32_exp2_coefficients[15] =
2201
{
2202
    const_float64( 0x3ff0000000000000ll ), /*  1 */
2203
    const_float64( 0x3fe0000000000000ll ), /*  2 */
2204
    const_float64( 0x3fc5555555555555ll ), /*  3 */
2205
    const_float64( 0x3fa5555555555555ll ), /*  4 */
2206
    const_float64( 0x3f81111111111111ll ), /*  5 */
2207
    const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2208
    const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2209
    const_float64( 0x3efa01a01a01a01all ), /*  8 */
2210
    const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2211
    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2212
    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2213
    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2214
    const_float64( 0x3de6124613a86d09ll ), /* 13 */
2215
    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2216
    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2217
};
2218

    
2219
float32 float32_exp2( float32 a STATUS_PARAM )
2220
{
2221
    flag aSign;
2222
    int16 aExp;
2223
    bits32 aSig;
2224
    float64 r, x, xn;
2225
    int i;
2226
    a = float32_squash_input_denormal(a STATUS_VAR);
2227

    
2228
    aSig = extractFloat32Frac( a );
2229
    aExp = extractFloat32Exp( a );
2230
    aSign = extractFloat32Sign( a );
2231

    
2232
    if ( aExp == 0xFF) {
2233
        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2234
        return (aSign) ? float32_zero : a;
2235
    }
2236
    if (aExp == 0) {
2237
        if (aSig == 0) return float32_one;
2238
    }
2239

    
2240
    float_raise( float_flag_inexact STATUS_VAR);
2241

    
2242
    /* ******************************* */
2243
    /* using float64 for approximation */
2244
    /* ******************************* */
2245
    x = float32_to_float64(a STATUS_VAR);
2246
    x = float64_mul(x, float64_ln2 STATUS_VAR);
2247

    
2248
    xn = x;
2249
    r = float64_one;
2250
    for (i = 0 ; i < 15 ; i++) {
2251
        float64 f;
2252

    
2253
        f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2254
        r = float64_add(r, f STATUS_VAR);
2255

    
2256
        xn = float64_mul(xn, x STATUS_VAR);
2257
    }
2258

    
2259
    return float64_to_float32(r, status);
2260
}
2261

    
2262
/*----------------------------------------------------------------------------
2263
| Returns the binary log of the single-precision floating-point value `a'.
2264
| The operation is performed according to the IEC/IEEE Standard for Binary
2265
| Floating-Point Arithmetic.
2266
*----------------------------------------------------------------------------*/
2267
float32 float32_log2( float32 a STATUS_PARAM )
2268
{
2269
    flag aSign, zSign;
2270
    int16 aExp;
2271
    bits32 aSig, zSig, i;
2272

    
2273
    a = float32_squash_input_denormal(a STATUS_VAR);
2274
    aSig = extractFloat32Frac( a );
2275
    aExp = extractFloat32Exp( a );
2276
    aSign = extractFloat32Sign( a );
2277

    
2278
    if ( aExp == 0 ) {
2279
        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2280
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2281
    }
2282
    if ( aSign ) {
2283
        float_raise( float_flag_invalid STATUS_VAR);
2284
        return float32_default_nan;
2285
    }
2286
    if ( aExp == 0xFF ) {
2287
        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2288
        return a;
2289
    }
2290

    
2291
    aExp -= 0x7F;
2292
    aSig |= 0x00800000;
2293
    zSign = aExp < 0;
2294
    zSig = aExp << 23;
2295

    
2296
    for (i = 1 << 22; i > 0; i >>= 1) {
2297
        aSig = ( (bits64)aSig * aSig ) >> 23;
2298
        if ( aSig & 0x01000000 ) {
2299
            aSig >>= 1;
2300
            zSig |= i;
2301
        }
2302
    }
2303

    
2304
    if ( zSign )
2305
        zSig = -zSig;
2306

    
2307
    return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2308
}
2309

    
2310
/*----------------------------------------------------------------------------
2311
| Returns 1 if the single-precision floating-point value `a' is equal to
2312
| the corresponding value `b', and 0 otherwise.  The comparison is performed
2313
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2314
*----------------------------------------------------------------------------*/
2315

    
2316
int float32_eq( float32 a, float32 b STATUS_PARAM )
2317
{
2318
    a = float32_squash_input_denormal(a STATUS_VAR);
2319
    b = float32_squash_input_denormal(b STATUS_VAR);
2320

    
2321
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2322
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2323
       ) {
2324
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2325
            float_raise( float_flag_invalid STATUS_VAR);
2326
        }
2327
        return 0;
2328
    }
2329
    return ( float32_val(a) == float32_val(b) ) ||
2330
            ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2331

    
2332
}
2333

    
2334
/*----------------------------------------------------------------------------
2335
| Returns 1 if the single-precision floating-point value `a' is less than
2336
| or equal to the corresponding value `b', and 0 otherwise.  The comparison
2337
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2338
| Arithmetic.
2339
*----------------------------------------------------------------------------*/
2340

    
2341
int float32_le( float32 a, float32 b STATUS_PARAM )
2342
{
2343
    flag aSign, bSign;
2344
    bits32 av, bv;
2345
    a = float32_squash_input_denormal(a STATUS_VAR);
2346
    b = float32_squash_input_denormal(b STATUS_VAR);
2347

    
2348
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2349
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2350
       ) {
2351
        float_raise( float_flag_invalid STATUS_VAR);
2352
        return 0;
2353
    }
2354
    aSign = extractFloat32Sign( a );
2355
    bSign = extractFloat32Sign( b );
2356
    av = float32_val(a);
2357
    bv = float32_val(b);
2358
    if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2359
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2360

    
2361
}
2362

    
2363
/*----------------------------------------------------------------------------
2364
| Returns 1 if the single-precision floating-point value `a' is less than
2365
| the corresponding value `b', and 0 otherwise.  The comparison is performed
2366
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2367
*----------------------------------------------------------------------------*/
2368

    
2369
int float32_lt( float32 a, float32 b STATUS_PARAM )
2370
{
2371
    flag aSign, bSign;
2372
    bits32 av, bv;
2373
    a = float32_squash_input_denormal(a STATUS_VAR);
2374
    b = float32_squash_input_denormal(b STATUS_VAR);
2375

    
2376
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2377
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2378
       ) {
2379
        float_raise( float_flag_invalid STATUS_VAR);
2380
        return 0;
2381
    }
2382
    aSign = extractFloat32Sign( a );
2383
    bSign = extractFloat32Sign( b );
2384
    av = float32_val(a);
2385
    bv = float32_val(b);
2386
    if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2387
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2388

    
2389
}
2390

    
2391
/*----------------------------------------------------------------------------
2392
| Returns 1 if the single-precision floating-point value `a' is equal to
2393
| the corresponding value `b', and 0 otherwise.  The invalid exception is
2394
| raised if either operand is a NaN.  Otherwise, the comparison is performed
2395
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2396
*----------------------------------------------------------------------------*/
2397

    
2398
int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2399
{
2400
    bits32 av, bv;
2401
    a = float32_squash_input_denormal(a STATUS_VAR);
2402
    b = float32_squash_input_denormal(b STATUS_VAR);
2403

    
2404
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2405
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2406
       ) {
2407
        float_raise( float_flag_invalid STATUS_VAR);
2408
        return 0;
2409
    }
2410
    av = float32_val(a);
2411
    bv = float32_val(b);
2412
    return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2413

    
2414
}
2415

    
2416
/*----------------------------------------------------------------------------
2417
| Returns 1 if the single-precision floating-point value `a' is less than or
2418
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2419
| cause an exception.  Otherwise, the comparison is performed according to the
2420
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2421
*----------------------------------------------------------------------------*/
2422

    
2423
int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2424
{
2425
    flag aSign, bSign;
2426
    bits32 av, bv;
2427
    a = float32_squash_input_denormal(a STATUS_VAR);
2428
    b = float32_squash_input_denormal(b STATUS_VAR);
2429

    
2430
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2431
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2432
       ) {
2433
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2434
            float_raise( float_flag_invalid STATUS_VAR);
2435
        }
2436
        return 0;
2437
    }
2438
    aSign = extractFloat32Sign( a );
2439
    bSign = extractFloat32Sign( b );
2440
    av = float32_val(a);
2441
    bv = float32_val(b);
2442
    if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2443
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2444

    
2445
}
2446

    
2447
/*----------------------------------------------------------------------------
2448
| Returns 1 if the single-precision floating-point value `a' is less than
2449
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2450
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2451
| Standard for Binary Floating-Point Arithmetic.
2452
*----------------------------------------------------------------------------*/
2453

    
2454
int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2455
{
2456
    flag aSign, bSign;
2457
    bits32 av, bv;
2458
    a = float32_squash_input_denormal(a STATUS_VAR);
2459
    b = float32_squash_input_denormal(b STATUS_VAR);
2460

    
2461
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2462
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2463
       ) {
2464
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2465
            float_raise( float_flag_invalid STATUS_VAR);
2466
        }
2467
        return 0;
2468
    }
2469
    aSign = extractFloat32Sign( a );
2470
    bSign = extractFloat32Sign( b );
2471
    av = float32_val(a);
2472
    bv = float32_val(b);
2473
    if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2474
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2475

    
2476
}
2477

    
2478
/*----------------------------------------------------------------------------
2479
| Returns the result of converting the double-precision floating-point value
2480
| `a' to the 32-bit two's complement integer format.  The conversion is
2481
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2482
| Arithmetic---which means in particular that the conversion is rounded
2483
| according to the current rounding mode.  If `a' is a NaN, the largest
2484
| positive integer is returned.  Otherwise, if the conversion overflows, the
2485
| largest integer with the same sign as `a' is returned.
2486
*----------------------------------------------------------------------------*/
2487

    
2488
int32 float64_to_int32( float64 a STATUS_PARAM )
2489
{
2490
    flag aSign;
2491
    int16 aExp, shiftCount;
2492
    bits64 aSig;
2493
    a = float64_squash_input_denormal(a STATUS_VAR);
2494

    
2495
    aSig = extractFloat64Frac( a );
2496
    aExp = extractFloat64Exp( a );
2497
    aSign = extractFloat64Sign( a );
2498
    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2499
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2500
    shiftCount = 0x42C - aExp;
2501
    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2502
    return roundAndPackInt32( aSign, aSig STATUS_VAR );
2503

    
2504
}
2505

    
2506
/*----------------------------------------------------------------------------
2507
| Returns the result of converting the double-precision floating-point value
2508
| `a' to the 32-bit two's complement integer format.  The conversion is
2509
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2510
| Arithmetic, except that the conversion is always rounded toward zero.
2511
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2512
| the conversion overflows, the largest integer with the same sign as `a' is
2513
| returned.
2514
*----------------------------------------------------------------------------*/
2515

    
2516
int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2517
{
2518
    flag aSign;
2519
    int16 aExp, shiftCount;
2520
    bits64 aSig, savedASig;
2521
    int32 z;
2522
    a = float64_squash_input_denormal(a STATUS_VAR);
2523

    
2524
    aSig = extractFloat64Frac( a );
2525
    aExp = extractFloat64Exp( a );
2526
    aSign = extractFloat64Sign( a );
2527
    if ( 0x41E < aExp ) {
2528
        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2529
        goto invalid;
2530
    }
2531
    else if ( aExp < 0x3FF ) {
2532
        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2533
        return 0;
2534
    }
2535
    aSig |= LIT64( 0x0010000000000000 );
2536
    shiftCount = 0x433 - aExp;
2537
    savedASig = aSig;
2538
    aSig >>= shiftCount;
2539
    z = aSig;
2540
    if ( aSign ) z = - z;
2541
    if ( ( z < 0 ) ^ aSign ) {
2542
 invalid:
2543
        float_raise( float_flag_invalid STATUS_VAR);
2544
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2545
    }
2546
    if ( ( aSig<<shiftCount ) != savedASig ) {
2547
        STATUS(float_exception_flags) |= float_flag_inexact;
2548
    }
2549
    return z;
2550

    
2551
}
2552

    
2553
/*----------------------------------------------------------------------------
2554
| Returns the result of converting the double-precision floating-point value
2555
| `a' to the 16-bit two's complement integer format.  The conversion is
2556
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2557
| Arithmetic, except that the conversion is always rounded toward zero.
2558
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2559
| the conversion overflows, the largest integer with the same sign as `a' is
2560
| returned.
2561
*----------------------------------------------------------------------------*/
2562

    
2563
int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2564
{
2565
    flag aSign;
2566
    int16 aExp, shiftCount;
2567
    bits64 aSig, savedASig;
2568
    int32 z;
2569

    
2570
    aSig = extractFloat64Frac( a );
2571
    aExp = extractFloat64Exp( a );
2572
    aSign = extractFloat64Sign( a );
2573
    if ( 0x40E < aExp ) {
2574
        if ( ( aExp == 0x7FF ) && aSig ) {
2575
            aSign = 0;
2576
        }
2577
        goto invalid;
2578
    }
2579
    else if ( aExp < 0x3FF ) {
2580
        if ( aExp || aSig ) {
2581
            STATUS(float_exception_flags) |= float_flag_inexact;
2582
        }
2583
        return 0;
2584
    }
2585
    aSig |= LIT64( 0x0010000000000000 );
2586
    shiftCount = 0x433 - aExp;
2587
    savedASig = aSig;
2588
    aSig >>= shiftCount;
2589
    z = aSig;
2590
    if ( aSign ) {
2591
        z = - z;
2592
    }
2593
    if ( ( (int16_t)z < 0 ) ^ aSign ) {
2594
 invalid:
2595
        float_raise( float_flag_invalid STATUS_VAR);
2596
        return aSign ? (sbits32) 0xffff8000 : 0x7FFF;
2597
    }
2598
    if ( ( aSig<<shiftCount ) != savedASig ) {
2599
        STATUS(float_exception_flags) |= float_flag_inexact;
2600
    }
2601
    return z;
2602
}
2603

    
2604
/*----------------------------------------------------------------------------
2605
| Returns the result of converting the double-precision floating-point value
2606
| `a' to the 64-bit two's complement integer format.  The conversion is
2607
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2608
| Arithmetic---which means in particular that the conversion is rounded
2609
| according to the current rounding mode.  If `a' is a NaN, the largest
2610
| positive integer is returned.  Otherwise, if the conversion overflows, the
2611
| largest integer with the same sign as `a' is returned.
2612
*----------------------------------------------------------------------------*/
2613

    
2614
int64 float64_to_int64( float64 a STATUS_PARAM )
2615
{
2616
    flag aSign;
2617
    int16 aExp, shiftCount;
2618
    bits64 aSig, aSigExtra;
2619
    a = float64_squash_input_denormal(a STATUS_VAR);
2620

    
2621
    aSig = extractFloat64Frac( a );
2622
    aExp = extractFloat64Exp( a );
2623
    aSign = extractFloat64Sign( a );
2624
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2625
    shiftCount = 0x433 - aExp;
2626
    if ( shiftCount <= 0 ) {
2627
        if ( 0x43E < aExp ) {
2628
            float_raise( float_flag_invalid STATUS_VAR);
2629
            if (    ! aSign
2630
                 || (    ( aExp == 0x7FF )
2631
                      && ( aSig != LIT64( 0x0010000000000000 ) ) )
2632
               ) {
2633
                return LIT64( 0x7FFFFFFFFFFFFFFF );
2634
            }
2635
            return (sbits64) LIT64( 0x8000000000000000 );
2636
        }
2637
        aSigExtra = 0;
2638
        aSig <<= - shiftCount;
2639
    }
2640
    else {
2641
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2642
    }
2643
    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2644

    
2645
}
2646

    
2647
/*----------------------------------------------------------------------------
2648
| Returns the result of converting the double-precision floating-point value
2649
| `a' to the 64-bit two's complement integer format.  The conversion is
2650
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2651
| Arithmetic, except that the conversion is always rounded toward zero.
2652
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2653
| the conversion overflows, the largest integer with the same sign as `a' is
2654
| returned.
2655
*----------------------------------------------------------------------------*/
2656

    
2657
int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2658
{
2659
    flag aSign;
2660
    int16 aExp, shiftCount;
2661
    bits64 aSig;
2662
    int64 z;
2663
    a = float64_squash_input_denormal(a STATUS_VAR);
2664

    
2665
    aSig = extractFloat64Frac( a );
2666
    aExp = extractFloat64Exp( a );
2667
    aSign = extractFloat64Sign( a );
2668
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2669
    shiftCount = aExp - 0x433;
2670
    if ( 0 <= shiftCount ) {
2671
        if ( 0x43E <= aExp ) {
2672
            if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2673
                float_raise( float_flag_invalid STATUS_VAR);
2674
                if (    ! aSign
2675
                     || (    ( aExp == 0x7FF )
2676
                          && ( aSig != LIT64( 0x0010000000000000 ) ) )
2677
                   ) {
2678
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
2679
                }
2680
            }
2681
            return (sbits64) LIT64( 0x8000000000000000 );
2682
        }
2683
        z = aSig<<shiftCount;
2684
    }
2685
    else {
2686
        if ( aExp < 0x3FE ) {
2687
            if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2688
            return 0;
2689
        }
2690
        z = aSig>>( - shiftCount );
2691
        if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2692
            STATUS(float_exception_flags) |= float_flag_inexact;
2693
        }
2694
    }
2695
    if ( aSign ) z = - z;
2696
    return z;
2697

    
2698
}
2699

    
2700
/*----------------------------------------------------------------------------
2701
| Returns the result of converting the double-precision floating-point value
2702
| `a' to the single-precision floating-point format.  The conversion is
2703
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2704
| Arithmetic.
2705
*----------------------------------------------------------------------------*/
2706

    
2707
float32 float64_to_float32( float64 a STATUS_PARAM )
2708
{
2709
    flag aSign;
2710
    int16 aExp;
2711
    bits64 aSig;
2712
    bits32 zSig;
2713
    a = float64_squash_input_denormal(a STATUS_VAR);
2714

    
2715
    aSig = extractFloat64Frac( a );
2716
    aExp = extractFloat64Exp( a );
2717
    aSign = extractFloat64Sign( a );
2718
    if ( aExp == 0x7FF ) {
2719
        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2720
        return packFloat32( aSign, 0xFF, 0 );
2721
    }
2722
    shift64RightJamming( aSig, 22, &aSig );
2723
    zSig = aSig;
2724
    if ( aExp || zSig ) {
2725
        zSig |= 0x40000000;
2726
        aExp -= 0x381;
2727
    }
2728
    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2729

    
2730
}
2731

    
2732

    
2733
/*----------------------------------------------------------------------------
2734
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2735
| half-precision floating-point value, returning the result.  After being
2736
| shifted into the proper positions, the three fields are simply added
2737
| together to form the result.  This means that any integer portion of `zSig'
2738
| will be added into the exponent.  Since a properly normalized significand
2739
| will have an integer portion equal to 1, the `zExp' input should be 1 less
2740
| than the desired result exponent whenever `zSig' is a complete, normalized
2741
| significand.
2742
*----------------------------------------------------------------------------*/
2743
static float16 packFloat16(flag zSign, int16 zExp, bits16 zSig)
2744
{
2745
    return make_float16(
2746
        (((bits32)zSign) << 15) + (((bits32)zExp) << 10) + zSig);
2747
}
2748

    
2749
/* Half precision floats come in two formats: standard IEEE and "ARM" format.
2750
   The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
2751

    
2752
float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
2753
{
2754
    flag aSign;
2755
    int16 aExp;
2756
    bits32 aSig;
2757

    
2758
    aSign = extractFloat16Sign(a);
2759
    aExp = extractFloat16Exp(a);
2760
    aSig = extractFloat16Frac(a);
2761

    
2762
    if (aExp == 0x1f && ieee) {
2763
        if (aSig) {
2764
            return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
2765
        }
2766
        return packFloat32(aSign, 0xff, aSig << 13);
2767
    }
2768
    if (aExp == 0) {
2769
        int8 shiftCount;
2770

    
2771
        if (aSig == 0) {
2772
            return packFloat32(aSign, 0, 0);
2773
        }
2774

    
2775
        shiftCount = countLeadingZeros32( aSig ) - 21;
2776
        aSig = aSig << shiftCount;
2777
        aExp = -shiftCount;
2778
    }
2779
    return packFloat32( aSign, aExp + 0x70, aSig << 13);
2780
}
2781

    
2782
float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
2783
{
2784
    flag aSign;
2785
    int16 aExp;
2786
    bits32 aSig;
2787
    bits32 mask;
2788
    bits32 increment;
2789
    int8 roundingMode;
2790
    a = float32_squash_input_denormal(a STATUS_VAR);
2791

    
2792
    aSig = extractFloat32Frac( a );
2793
    aExp = extractFloat32Exp( a );
2794
    aSign = extractFloat32Sign( a );
2795
    if ( aExp == 0xFF ) {
2796
        if (aSig) {
2797
            /* Input is a NaN */
2798
            float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2799
            if (!ieee) {
2800
                return packFloat16(aSign, 0, 0);
2801
            }
2802
            return r;
2803
        }
2804
        /* Infinity */
2805
        if (!ieee) {
2806
            float_raise(float_flag_invalid STATUS_VAR);
2807
            return packFloat16(aSign, 0x1f, 0x3ff);
2808
        }
2809
        return packFloat16(aSign, 0x1f, 0);
2810
    }
2811
    if (aExp == 0 && aSig == 0) {
2812
        return packFloat16(aSign, 0, 0);
2813
    }
2814
    /* Decimal point between bits 22 and 23.  */
2815
    aSig |= 0x00800000;
2816
    aExp -= 0x7f;
2817
    if (aExp < -14) {
2818
        mask = 0x00ffffff;
2819
        if (aExp >= -24) {
2820
            mask >>= 25 + aExp;
2821
        }
2822
    } else {
2823
        mask = 0x00001fff;
2824
    }
2825
    if (aSig & mask) {
2826
        float_raise( float_flag_underflow STATUS_VAR );
2827
        roundingMode = STATUS(float_rounding_mode);
2828
        switch (roundingMode) {
2829
        case float_round_nearest_even:
2830
            increment = (mask + 1) >> 1;
2831
            if ((aSig & mask) == increment) {
2832
                increment = aSig & (increment << 1);
2833
            }
2834
            break;
2835
        case float_round_up:
2836
            increment = aSign ? 0 : mask;
2837
            break;
2838
        case float_round_down:
2839
            increment = aSign ? mask : 0;
2840
            break;
2841
        default: /* round_to_zero */
2842
            increment = 0;
2843
            break;
2844
        }
2845
        aSig += increment;
2846
        if (aSig >= 0x01000000) {
2847
            aSig >>= 1;
2848
            aExp++;
2849
        }
2850
    } else if (aExp < -14
2851
          && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2852
        float_raise( float_flag_underflow STATUS_VAR);
2853
    }
2854

    
2855
    if (ieee) {
2856
        if (aExp > 15) {
2857
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2858
            return packFloat16(aSign, 0x1f, 0);
2859
        }
2860
    } else {
2861
        if (aExp > 16) {
2862
            float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
2863
            return packFloat16(aSign, 0x1f, 0x3ff);
2864
        }
2865
    }
2866
    if (aExp < -24) {
2867
        return packFloat16(aSign, 0, 0);
2868
    }
2869
    if (aExp < -14) {
2870
        aSig >>= -14 - aExp;
2871
        aExp = -14;
2872
    }
2873
    return packFloat16(aSign, aExp + 14, aSig >> 13);
2874
}
2875

    
2876
#ifdef FLOATX80
2877

    
2878
/*----------------------------------------------------------------------------
2879
| Returns the result of converting the double-precision floating-point value
2880
| `a' to the extended double-precision floating-point format.  The conversion
2881
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2882
| Arithmetic.
2883
*----------------------------------------------------------------------------*/
2884

    
2885
floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2886
{
2887
    flag aSign;
2888
    int16 aExp;
2889
    bits64 aSig;
2890

    
2891
    a = float64_squash_input_denormal(a STATUS_VAR);
2892
    aSig = extractFloat64Frac( a );
2893
    aExp = extractFloat64Exp( a );
2894
    aSign = extractFloat64Sign( a );
2895
    if ( aExp == 0x7FF ) {
2896
        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2897
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2898
    }
2899
    if ( aExp == 0 ) {
2900
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2901
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2902
    }
2903
    return
2904
        packFloatx80(
2905
            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2906

    
2907
}
2908

    
2909
#endif
2910

    
2911
#ifdef FLOAT128
2912

    
2913
/*----------------------------------------------------------------------------
2914
| Returns the result of converting the double-precision floating-point value
2915
| `a' to the quadruple-precision floating-point format.  The conversion is
2916
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2917
| Arithmetic.
2918
*----------------------------------------------------------------------------*/
2919

    
2920
float128 float64_to_float128( float64 a STATUS_PARAM )
2921
{
2922
    flag aSign;
2923
    int16 aExp;
2924
    bits64 aSig, zSig0, zSig1;
2925

    
2926
    a = float64_squash_input_denormal(a STATUS_VAR);
2927
    aSig = extractFloat64Frac( a );
2928
    aExp = extractFloat64Exp( a );
2929
    aSign = extractFloat64Sign( a );
2930
    if ( aExp == 0x7FF ) {
2931
        if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2932
        return packFloat128( aSign, 0x7FFF, 0, 0 );
2933
    }
2934
    if ( aExp == 0 ) {
2935
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2936
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2937
        --aExp;
2938
    }
2939
    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2940
    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2941

    
2942
}
2943

    
2944
#endif
2945

    
2946
/*----------------------------------------------------------------------------
2947
| Rounds the double-precision floating-point value `a' to an integer, and
2948
| returns the result as a double-precision floating-point value.  The
2949
| operation is performed according to the IEC/IEEE Standard for Binary
2950
| Floating-Point Arithmetic.
2951
*----------------------------------------------------------------------------*/
2952

    
2953
float64 float64_round_to_int( float64 a STATUS_PARAM )
2954
{
2955
    flag aSign;
2956
    int16 aExp;
2957
    bits64 lastBitMask, roundBitsMask;
2958
    int8 roundingMode;
2959
    bits64 z;
2960
    a = float64_squash_input_denormal(a STATUS_VAR);
2961

    
2962
    aExp = extractFloat64Exp( a );
2963
    if ( 0x433 <= aExp ) {
2964
        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2965
            return propagateFloat64NaN( a, a STATUS_VAR );
2966
        }
2967
        return a;
2968
    }
2969
    if ( aExp < 0x3FF ) {
2970
        if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
2971
        STATUS(float_exception_flags) |= float_flag_inexact;
2972
        aSign = extractFloat64Sign( a );
2973
        switch ( STATUS(float_rounding_mode) ) {
2974
         case float_round_nearest_even:
2975
            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2976
                return packFloat64( aSign, 0x3FF, 0 );
2977
            }
2978
            break;
2979
         case float_round_down:
2980
            return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2981
         case float_round_up:
2982
            return make_float64(
2983
            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2984
        }
2985
        return packFloat64( aSign, 0, 0 );
2986
    }
2987
    lastBitMask = 1;
2988
    lastBitMask <<= 0x433 - aExp;
2989
    roundBitsMask = lastBitMask - 1;
2990
    z = float64_val(a);
2991
    roundingMode = STATUS(float_rounding_mode);
2992
    if ( roundingMode == float_round_nearest_even ) {
2993
        z += lastBitMask>>1;
2994
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2995
    }
2996
    else if ( roundingMode != float_round_to_zero ) {
2997
        if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
2998
            z += roundBitsMask;
2999
        }
3000
    }
3001
    z &= ~ roundBitsMask;
3002
    if ( z != float64_val(a) )
3003
        STATUS(float_exception_flags) |= float_flag_inexact;
3004
    return make_float64(z);
3005

    
3006
}
3007

    
3008
float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3009
{
3010
    int oldmode;
3011
    float64 res;
3012
    oldmode = STATUS(float_rounding_mode);
3013
    STATUS(float_rounding_mode) = float_round_to_zero;
3014
    res = float64_round_to_int(a STATUS_VAR);
3015
    STATUS(float_rounding_mode) = oldmode;
3016
    return res;
3017
}
3018

    
3019
/*----------------------------------------------------------------------------
3020
| Returns the result of adding the absolute values of the double-precision
3021
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3022
| before being returned.  `zSign' is ignored if the result is a NaN.
3023
| The addition is performed according to the IEC/IEEE Standard for Binary
3024
| Floating-Point Arithmetic.
3025
*----------------------------------------------------------------------------*/
3026

    
3027
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3028
{
3029
    int16 aExp, bExp, zExp;
3030
    bits64 aSig, bSig, zSig;
3031
    int16 expDiff;
3032

    
3033
    aSig = extractFloat64Frac( a );
3034
    aExp = extractFloat64Exp( a );
3035
    bSig = extractFloat64Frac( b );
3036
    bExp = extractFloat64Exp( b );
3037
    expDiff = aExp - bExp;
3038
    aSig <<= 9;
3039
    bSig <<= 9;
3040
    if ( 0 < expDiff ) {
3041
        if ( aExp == 0x7FF ) {
3042
            if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3043
            return a;
3044
        }
3045
        if ( bExp == 0 ) {
3046
            --expDiff;
3047
        }
3048
        else {
3049
            bSig |= LIT64( 0x2000000000000000 );
3050
        }
3051
        shift64RightJamming( bSig, expDiff, &bSig );
3052
        zExp = aExp;
3053
    }
3054
    else if ( expDiff < 0 ) {
3055
        if ( bExp == 0x7FF ) {
3056
            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3057
            return packFloat64( zSign, 0x7FF, 0 );
3058
        }
3059
        if ( aExp == 0 ) {
3060
            ++expDiff;
3061
        }
3062
        else {
3063
            aSig |= LIT64( 0x2000000000000000 );
3064
        }
3065
        shift64RightJamming( aSig, - expDiff, &aSig );
3066
        zExp = bExp;
3067
    }
3068
    else {
3069
        if ( aExp == 0x7FF ) {
3070
            if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3071
            return a;
3072
        }
3073
        if ( aExp == 0 ) {
3074
            if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
3075
            return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3076
        }
3077
        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3078
        zExp = aExp;
3079
        goto roundAndPack;
3080
    }
3081
    aSig |= LIT64( 0x2000000000000000 );
3082
    zSig = ( aSig + bSig )<<1;
3083
    --zExp;
3084
    if ( (sbits64) zSig < 0 ) {
3085
        zSig = aSig + bSig;
3086
        ++zExp;
3087
    }
3088
 roundAndPack:
3089
    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3090

    
3091
}
3092

    
3093
/*----------------------------------------------------------------------------
3094
| Returns the result of subtracting the absolute values of the double-
3095
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
3096
| difference is negated before being returned.  `zSign' is ignored if the
3097
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
3098
| Standard for Binary Floating-Point Arithmetic.
3099
*----------------------------------------------------------------------------*/
3100

    
3101
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3102
{
3103
    int16 aExp, bExp, zExp;
3104
    bits64 aSig, bSig, zSig;
3105
    int16 expDiff;
3106

    
3107
    aSig = extractFloat64Frac( a );
3108
    aExp = extractFloat64Exp( a );
3109
    bSig = extractFloat64Frac( b );
3110
    bExp = extractFloat64Exp( b );
3111
    expDiff = aExp - bExp;
3112
    aSig <<= 10;
3113
    bSig <<= 10;
3114
    if ( 0 < expDiff ) goto aExpBigger;
3115
    if ( expDiff < 0 ) goto bExpBigger;
3116
    if ( aExp == 0x7FF ) {
3117
        if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3118
        float_raise( float_flag_invalid STATUS_VAR);
3119
        return float64_default_nan;
3120
    }
3121
    if ( aExp == 0 ) {
3122
        aExp = 1;
3123
        bExp = 1;
3124
    }
3125
    if ( bSig < aSig ) goto aBigger;
3126
    if ( aSig < bSig ) goto bBigger;
3127
    return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3128
 bExpBigger:
3129
    if ( bExp == 0x7FF ) {
3130
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3131
        return packFloat64( zSign ^ 1, 0x7FF, 0 );
3132
    }
3133
    if ( aExp == 0 ) {
3134
        ++expDiff;
3135
    }
3136
    else {
3137
        aSig |= LIT64( 0x4000000000000000 );
3138
    }
3139
    shift64RightJamming( aSig, - expDiff, &aSig );
3140
    bSig |= LIT64( 0x4000000000000000 );
3141
 bBigger:
3142
    zSig = bSig - aSig;
3143
    zExp = bExp;
3144
    zSign ^= 1;
3145
    goto normalizeRoundAndPack;
3146
 aExpBigger:
3147
    if ( aExp == 0x7FF ) {
3148
        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3149
        return a;
3150
    }
3151
    if ( bExp == 0 ) {
3152
        --expDiff;
3153
    }
3154
    else {
3155
        bSig |= LIT64( 0x4000000000000000 );
3156
    }
3157
    shift64RightJamming( bSig, expDiff, &bSig );
3158
    aSig |= LIT64( 0x4000000000000000 );
3159
 aBigger:
3160
    zSig = aSig - bSig;
3161
    zExp = aExp;
3162
 normalizeRoundAndPack:
3163
    --zExp;
3164
    return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3165

    
3166
}
3167

    
3168
/*----------------------------------------------------------------------------
3169
| Returns the result of adding the double-precision floating-point values `a'
3170
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
3171
| Binary Floating-Point Arithmetic.
3172
*----------------------------------------------------------------------------*/
3173

    
3174
float64 float64_add( float64 a, float64 b STATUS_PARAM )
3175
{
3176
    flag aSign, bSign;
3177
    a = float64_squash_input_denormal(a STATUS_VAR);
3178
    b = float64_squash_input_denormal(b STATUS_VAR);
3179

    
3180
    aSign = extractFloat64Sign( a );
3181
    bSign = extractFloat64Sign( b );
3182
    if ( aSign == bSign ) {
3183
        return addFloat64Sigs( a, b, aSign STATUS_VAR );
3184
    }
3185
    else {
3186
        return subFloat64Sigs( a, b, aSign STATUS_VAR );
3187
    }
3188

    
3189
}
3190

    
3191
/*----------------------------------------------------------------------------
3192
| Returns the result of subtracting the double-precision floating-point values
3193
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3194
| for Binary Floating-Point Arithmetic.
3195
*----------------------------------------------------------------------------*/
3196

    
3197
float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3198
{
3199
    flag aSign, bSign;
3200
    a = float64_squash_input_denormal(a STATUS_VAR);
3201
    b = float64_squash_input_denormal(b STATUS_VAR);
3202

    
3203
    aSign = extractFloat64Sign( a );
3204
    bSign = extractFloat64Sign( b );
3205
    if ( aSign == bSign ) {
3206
        return subFloat64Sigs( a, b, aSign STATUS_VAR );
3207
    }
3208
    else {
3209
        return addFloat64Sigs( a, b, aSign STATUS_VAR );
3210
    }
3211

    
3212
}
3213

    
3214
/*----------------------------------------------------------------------------
3215
| Returns the result of multiplying the double-precision floating-point values
3216
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3217
| for Binary Floating-Point Arithmetic.
3218
*----------------------------------------------------------------------------*/
3219

    
3220
float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3221
{
3222
    flag aSign, bSign, zSign;
3223
    int16 aExp, bExp, zExp;
3224
    bits64 aSig, bSig, zSig0, zSig1;
3225

    
3226
    a = float64_squash_input_denormal(a STATUS_VAR);
3227
    b = float64_squash_input_denormal(b STATUS_VAR);
3228

    
3229
    aSig = extractFloat64Frac( a );
3230
    aExp = extractFloat64Exp( a );
3231
    aSign = extractFloat64Sign( a );
3232
    bSig = extractFloat64Frac( b );
3233
    bExp = extractFloat64Exp( b );
3234
    bSign = extractFloat64Sign( b );
3235
    zSign = aSign ^ bSign;
3236
    if ( aExp == 0x7FF ) {
3237
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3238
            return propagateFloat64NaN( a, b STATUS_VAR );
3239
        }
3240
        if ( ( bExp | bSig ) == 0 ) {
3241
            float_raise( float_flag_invalid STATUS_VAR);
3242
            return float64_default_nan;
3243
        }
3244
        return packFloat64( zSign, 0x7FF, 0 );
3245
    }
3246
    if ( bExp == 0x7FF ) {
3247
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3248
        if ( ( aExp | aSig ) == 0 ) {
3249
            float_raise( float_flag_invalid STATUS_VAR);
3250
            return float64_default_nan;
3251
        }
3252
        return packFloat64( zSign, 0x7FF, 0 );
3253
    }
3254
    if ( aExp == 0 ) {
3255
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3256
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3257
    }
3258
    if ( bExp == 0 ) {
3259
        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3260
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3261
    }
3262
    zExp = aExp + bExp - 0x3FF;
3263
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3264
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3265
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
3266
    zSig0 |= ( zSig1 != 0 );
3267
    if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
3268
        zSig0 <<= 1;
3269
        --zExp;
3270
    }
3271
    return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3272

    
3273
}
3274

    
3275
/*----------------------------------------------------------------------------
3276
| Returns the result of dividing the double-precision floating-point value `a'
3277
| by the corresponding value `b'.  The operation is performed according to
3278
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3279
*----------------------------------------------------------------------------*/
3280

    
3281
float64 float64_div( float64 a, float64 b STATUS_PARAM )
3282
{
3283
    flag aSign, bSign, zSign;
3284
    int16 aExp, bExp, zExp;
3285
    bits64 aSig, bSig, zSig;
3286
    bits64 rem0, rem1;
3287
    bits64 term0, term1;
3288
    a = float64_squash_input_denormal(a STATUS_VAR);
3289
    b = float64_squash_input_denormal(b STATUS_VAR);
3290

    
3291
    aSig = extractFloat64Frac( a );
3292
    aExp = extractFloat64Exp( a );
3293
    aSign = extractFloat64Sign( a );
3294
    bSig = extractFloat64Frac( b );
3295
    bExp = extractFloat64Exp( b );
3296
    bSign = extractFloat64Sign( b );
3297
    zSign = aSign ^ bSign;
3298
    if ( aExp == 0x7FF ) {
3299
        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3300
        if ( bExp == 0x7FF ) {
3301
            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3302
            float_raise( float_flag_invalid STATUS_VAR);
3303
            return float64_default_nan;
3304
        }
3305
        return packFloat64( zSign, 0x7FF, 0 );
3306
    }
3307
    if ( bExp == 0x7FF ) {
3308
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3309
        return packFloat64( zSign, 0, 0 );
3310
    }
3311
    if ( bExp == 0 ) {
3312
        if ( bSig == 0 ) {
3313
            if ( ( aExp | aSig ) == 0 ) {
3314
                float_raise( float_flag_invalid STATUS_VAR);
3315
                return float64_default_nan;
3316
            }
3317
            float_raise( float_flag_divbyzero STATUS_VAR);
3318
            return packFloat64( zSign, 0x7FF, 0 );
3319
        }
3320
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3321
    }
3322
    if ( aExp == 0 ) {
3323
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3324
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3325
    }
3326
    zExp = aExp - bExp + 0x3FD;
3327
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3328
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3329
    if ( bSig <= ( aSig + aSig ) ) {
3330
        aSig >>= 1;
3331
        ++zExp;
3332
    }
3333
    zSig = estimateDiv128To64( aSig, 0, bSig );
3334
    if ( ( zSig & 0x1FF ) <= 2 ) {
3335
        mul64To128( bSig, zSig, &term0, &term1 );
3336
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3337
        while ( (sbits64) rem0 < 0 ) {
3338
            --zSig;
3339
            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3340
        }
3341
        zSig |= ( rem1 != 0 );
3342
    }
3343
    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3344

    
3345
}
3346

    
3347
/*----------------------------------------------------------------------------
3348
| Returns the remainder of the double-precision floating-point value `a'
3349
| with respect to the corresponding value `b'.  The operation is performed
3350
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3351
*----------------------------------------------------------------------------*/
3352

    
3353
float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3354
{
3355
    flag aSign, zSign;
3356
    int16 aExp, bExp, expDiff;
3357
    bits64 aSig, bSig;
3358
    bits64 q, alternateASig;
3359
    sbits64 sigMean;
3360

    
3361
    a = float64_squash_input_denormal(a STATUS_VAR);
3362
    b = float64_squash_input_denormal(b STATUS_VAR);
3363
    aSig = extractFloat64Frac( a );
3364
    aExp = extractFloat64Exp( a );
3365
    aSign = extractFloat64Sign( a );
3366
    bSig = extractFloat64Frac( b );
3367
    bExp = extractFloat64Exp( b );
3368
    if ( aExp == 0x7FF ) {
3369
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3370
            return propagateFloat64NaN( a, b STATUS_VAR );
3371
        }
3372
        float_raise( float_flag_invalid STATUS_VAR);
3373
        return float64_default_nan;
3374
    }
3375
    if ( bExp == 0x7FF ) {
3376
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3377
        return a;
3378
    }
3379
    if ( bExp == 0 ) {
3380
        if ( bSig == 0 ) {
3381
            float_raise( float_flag_invalid STATUS_VAR);
3382
            return float64_default_nan;
3383
        }
3384
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3385
    }
3386
    if ( aExp == 0 ) {
3387
        if ( aSig == 0 ) return a;
3388
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3389
    }
3390
    expDiff = aExp - bExp;
3391
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3392
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3393
    if ( expDiff < 0 ) {
3394
        if ( expDiff < -1 ) return a;
3395
        aSig >>= 1;
3396
    }
3397
    q = ( bSig <= aSig );
3398
    if ( q ) aSig -= bSig;
3399
    expDiff -= 64;
3400
    while ( 0 < expDiff ) {
3401
        q = estimateDiv128To64( aSig, 0, bSig );
3402
        q = ( 2 < q ) ? q - 2 : 0;
3403
        aSig = - ( ( bSig>>2 ) * q );
3404
        expDiff -= 62;
3405
    }
3406
    expDiff += 64;
3407
    if ( 0 < expDiff ) {
3408
        q = estimateDiv128To64( aSig, 0, bSig );
3409
        q = ( 2 < q ) ? q - 2 : 0;
3410
        q >>= 64 - expDiff;
3411
        bSig >>= 2;
3412
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3413
    }
3414
    else {
3415
        aSig >>= 2;
3416
        bSig >>= 2;
3417
    }
3418
    do {
3419
        alternateASig = aSig;
3420
        ++q;
3421
        aSig -= bSig;
3422
    } while ( 0 <= (sbits64) aSig );
3423
    sigMean = aSig + alternateASig;
3424
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3425
        aSig = alternateASig;
3426
    }
3427
    zSign = ( (sbits64) aSig < 0 );
3428
    if ( zSign ) aSig = - aSig;
3429
    return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3430

    
3431
}
3432

    
3433
/*----------------------------------------------------------------------------
3434
| Returns the square root of the double-precision floating-point value `a'.
3435
| The operation is performed according to the IEC/IEEE Standard for Binary
3436
| Floating-Point Arithmetic.
3437
*----------------------------------------------------------------------------*/
3438

    
3439
float64 float64_sqrt( float64 a STATUS_PARAM )
3440
{
3441
    flag aSign;
3442
    int16 aExp, zExp;
3443
    bits64 aSig, zSig, doubleZSig;
3444
    bits64 rem0, rem1, term0, term1;
3445
    a = float64_squash_input_denormal(a STATUS_VAR);
3446

    
3447
    aSig = extractFloat64Frac( a );
3448
    aExp = extractFloat64Exp( a );
3449
    aSign = extractFloat64Sign( a );
3450
    if ( aExp == 0x7FF ) {
3451
        if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3452
        if ( ! aSign ) return a;
3453
        float_raise( float_flag_invalid STATUS_VAR);
3454
        return float64_default_nan;
3455
    }
3456
    if ( aSign ) {
3457
        if ( ( aExp | aSig ) == 0 ) return a;
3458
        float_raise( float_flag_invalid STATUS_VAR);
3459
        return float64_default_nan;
3460
    }
3461
    if ( aExp == 0 ) {
3462
        if ( aSig == 0 ) return float64_zero;
3463
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3464
    }
3465
    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3466
    aSig |= LIT64( 0x0010000000000000 );
3467
    zSig = estimateSqrt32( aExp, aSig>>21 );
3468
    aSig <<= 9 - ( aExp & 1 );
3469
    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3470
    if ( ( zSig & 0x1FF ) <= 5 ) {
3471
        doubleZSig = zSig<<1;
3472
        mul64To128( zSig, zSig, &term0, &term1 );
3473
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3474
        while ( (sbits64) rem0 < 0 ) {
3475
            --zSig;
3476
            doubleZSig -= 2;
3477
            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3478
        }
3479
        zSig |= ( ( rem0 | rem1 ) != 0 );
3480
    }
3481
    return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3482

    
3483
}
3484

    
3485
/*----------------------------------------------------------------------------
3486
| Returns the binary log of the double-precision floating-point value `a'.
3487
| The operation is performed according to the IEC/IEEE Standard for Binary
3488
| Floating-Point Arithmetic.
3489
*----------------------------------------------------------------------------*/
3490
float64 float64_log2( float64 a STATUS_PARAM )
3491
{
3492
    flag aSign, zSign;
3493
    int16 aExp;
3494
    bits64 aSig, aSig0, aSig1, zSig, i;
3495
    a = float64_squash_input_denormal(a STATUS_VAR);
3496

    
3497
    aSig = extractFloat64Frac( a );
3498
    aExp = extractFloat64Exp( a );
3499
    aSign = extractFloat64Sign( a );
3500

    
3501
    if ( aExp == 0 ) {
3502
        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3503
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3504
    }
3505
    if ( aSign ) {
3506
        float_raise( float_flag_invalid STATUS_VAR);
3507
        return float64_default_nan;
3508
    }
3509
    if ( aExp == 0x7FF ) {
3510
        if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3511
        return a;
3512
    }
3513

    
3514
    aExp -= 0x3FF;
3515
    aSig |= LIT64( 0x0010000000000000 );
3516
    zSign = aExp < 0;
3517
    zSig = (bits64)aExp << 52;
3518
    for (i = 1LL << 51; i > 0; i >>= 1) {
3519
        mul64To128( aSig, aSig, &aSig0, &aSig1 );
3520
        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3521
        if ( aSig & LIT64( 0x0020000000000000 ) ) {
3522
            aSig >>= 1;
3523
            zSig |= i;
3524
        }
3525
    }
3526

    
3527
    if ( zSign )
3528
        zSig = -zSig;
3529
    return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3530
}
3531

    
3532
/*----------------------------------------------------------------------------
3533
| Returns 1 if the double-precision floating-point value `a' is equal to the
3534
| corresponding value `b', and 0 otherwise.  The comparison is performed
3535
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3536
*----------------------------------------------------------------------------*/
3537

    
3538
int float64_eq( float64 a, float64 b STATUS_PARAM )
3539
{
3540
    bits64 av, bv;
3541
    a = float64_squash_input_denormal(a STATUS_VAR);
3542
    b = float64_squash_input_denormal(b STATUS_VAR);
3543

    
3544
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3545
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3546
       ) {
3547
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3548
            float_raise( float_flag_invalid STATUS_VAR);
3549
        }
3550
        return 0;
3551
    }
3552
    av = float64_val(a);
3553
    bv = float64_val(b);
3554
    return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3555

    
3556
}
3557

    
3558
/*----------------------------------------------------------------------------
3559
| Returns 1 if the double-precision floating-point value `a' is less than or
3560
| equal to the corresponding value `b', and 0 otherwise.  The comparison is
3561
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3562
| Arithmetic.
3563
*----------------------------------------------------------------------------*/
3564

    
3565
int float64_le( float64 a, float64 b STATUS_PARAM )
3566
{
3567
    flag aSign, bSign;
3568
    bits64 av, bv;
3569
    a = float64_squash_input_denormal(a STATUS_VAR);
3570
    b = float64_squash_input_denormal(b STATUS_VAR);
3571

    
3572
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3573
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3574
       ) {
3575
        float_raise( float_flag_invalid STATUS_VAR);
3576
        return 0;
3577
    }
3578
    aSign = extractFloat64Sign( a );
3579
    bSign = extractFloat64Sign( b );
3580
    av = float64_val(a);
3581
    bv = float64_val(b);
3582
    if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3583
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3584

    
3585
}
3586

    
3587
/*----------------------------------------------------------------------------
3588
| Returns 1 if the double-precision floating-point value `a' is less than
3589
| the corresponding value `b', and 0 otherwise.  The comparison is performed
3590
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3591
*----------------------------------------------------------------------------*/
3592

    
3593
int float64_lt( float64 a, float64 b STATUS_PARAM )
3594
{
3595
    flag aSign, bSign;
3596
    bits64 av, bv;
3597

    
3598
    a = float64_squash_input_denormal(a STATUS_VAR);
3599
    b = float64_squash_input_denormal(b STATUS_VAR);
3600
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3601
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3602
       ) {
3603
        float_raise( float_flag_invalid STATUS_VAR);
3604
        return 0;
3605
    }
3606
    aSign = extractFloat64Sign( a );
3607
    bSign = extractFloat64Sign( b );
3608
    av = float64_val(a);
3609
    bv = float64_val(b);
3610
    if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3611
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3612

    
3613
}
3614

    
3615
/*----------------------------------------------------------------------------
3616
| Returns 1 if the double-precision floating-point value `a' is equal to the
3617
| corresponding value `b', and 0 otherwise.  The invalid exception is raised
3618
| if either operand is a NaN.  Otherwise, the comparison is performed
3619
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3620
*----------------------------------------------------------------------------*/
3621

    
3622
int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3623
{
3624
    bits64 av, bv;
3625
    a = float64_squash_input_denormal(a STATUS_VAR);
3626
    b = float64_squash_input_denormal(b STATUS_VAR);
3627

    
3628
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3629
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3630
       ) {
3631
        float_raise( float_flag_invalid STATUS_VAR);
3632
        return 0;
3633
    }
3634
    av = float64_val(a);
3635
    bv = float64_val(b);
3636
    return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3637

    
3638
}
3639

    
3640
/*----------------------------------------------------------------------------
3641
| Returns 1 if the double-precision floating-point value `a' is less than or
3642
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3643
| cause an exception.  Otherwise, the comparison is performed according to the
3644
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3645
*----------------------------------------------------------------------------*/
3646

    
3647
int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3648
{
3649
    flag aSign, bSign;
3650
    bits64 av, bv;
3651
    a = float64_squash_input_denormal(a STATUS_VAR);
3652
    b = float64_squash_input_denormal(b STATUS_VAR);
3653

    
3654
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3655
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3656
       ) {
3657
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3658
            float_raise( float_flag_invalid STATUS_VAR);
3659
        }
3660
        return 0;
3661
    }
3662
    aSign = extractFloat64Sign( a );
3663
    bSign = extractFloat64Sign( b );
3664
    av = float64_val(a);
3665
    bv = float64_val(b);
3666
    if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3667
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3668

    
3669
}
3670

    
3671
/*----------------------------------------------------------------------------
3672
| Returns 1 if the double-precision floating-point value `a' is less than
3673
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3674
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3675
| Standard for Binary Floating-Point Arithmetic.
3676
*----------------------------------------------------------------------------*/
3677

    
3678
int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3679
{
3680
    flag aSign, bSign;
3681
    bits64 av, bv;
3682
    a = float64_squash_input_denormal(a STATUS_VAR);
3683
    b = float64_squash_input_denormal(b STATUS_VAR);
3684

    
3685
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3686
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3687
       ) {
3688
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3689
            float_raise( float_flag_invalid STATUS_VAR);
3690
        }
3691
        return 0;
3692
    }
3693
    aSign = extractFloat64Sign( a );
3694
    bSign = extractFloat64Sign( b );
3695
    av = float64_val(a);
3696
    bv = float64_val(b);
3697
    if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3698
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3699

    
3700
}
3701

    
3702
#ifdef FLOATX80
3703

    
3704
/*----------------------------------------------------------------------------
3705
| Returns the result of converting the extended double-precision floating-
3706
| point value `a' to the 32-bit two's complement integer format.  The
3707
| conversion is performed according to the IEC/IEEE Standard for Binary
3708
| Floating-Point Arithmetic---which means in particular that the conversion
3709
| is rounded according to the current rounding mode.  If `a' is a NaN, the
3710
| largest positive integer is returned.  Otherwise, if the conversion
3711
| overflows, the largest integer with the same sign as `a' is returned.
3712
*----------------------------------------------------------------------------*/
3713

    
3714
int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3715
{
3716
    flag aSign;
3717
    int32 aExp, shiftCount;
3718
    bits64 aSig;
3719

    
3720
    aSig = extractFloatx80Frac( a );
3721
    aExp = extractFloatx80Exp( a );
3722
    aSign = extractFloatx80Sign( a );
3723
    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3724
    shiftCount = 0x4037 - aExp;
3725
    if ( shiftCount <= 0 ) shiftCount = 1;
3726
    shift64RightJamming( aSig, shiftCount, &aSig );
3727
    return roundAndPackInt32( aSign, aSig STATUS_VAR );
3728

    
3729
}
3730

    
3731
/*----------------------------------------------------------------------------
3732
| Returns the result of converting the extended double-precision floating-
3733
| point value `a' to the 32-bit two's complement integer format.  The
3734
| conversion is performed according to the IEC/IEEE Standard for Binary
3735
| Floating-Point Arithmetic, except that the conversion is always rounded
3736
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3737
| Otherwise, if the conversion overflows, the largest integer with the same
3738
| sign as `a' is returned.
3739
*----------------------------------------------------------------------------*/
3740

    
3741
int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3742
{
3743
    flag aSign;
3744
    int32 aExp, shiftCount;
3745
    bits64 aSig, savedASig;
3746
    int32 z;
3747

    
3748
    aSig = extractFloatx80Frac( a );
3749
    aExp = extractFloatx80Exp( a );
3750
    aSign = extractFloatx80Sign( a );
3751
    if ( 0x401E < aExp ) {
3752
        if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3753
        goto invalid;
3754
    }
3755
    else if ( aExp < 0x3FFF ) {
3756
        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3757
        return 0;
3758
    }
3759
    shiftCount = 0x403E - aExp;
3760
    savedASig = aSig;
3761
    aSig >>= shiftCount;
3762
    z = aSig;
3763
    if ( aSign ) z = - z;
3764
    if ( ( z < 0 ) ^ aSign ) {
3765
 invalid:
3766
        float_raise( float_flag_invalid STATUS_VAR);
3767
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3768
    }
3769
    if ( ( aSig<<shiftCount ) != savedASig ) {
3770
        STATUS(float_exception_flags) |= float_flag_inexact;
3771
    }
3772
    return z;
3773

    
3774
}
3775

    
3776
/*----------------------------------------------------------------------------
3777
| Returns the result of converting the extended double-precision floating-
3778
| point value `a' to the 64-bit two's complement integer format.  The
3779
| conversion is performed according to the IEC/IEEE Standard for Binary
3780
| Floating-Point Arithmetic---which means in particular that the conversion
3781
| is rounded according to the current rounding mode.  If `a' is a NaN,
3782
| the largest positive integer is returned.  Otherwise, if the conversion
3783
| overflows, the largest integer with the same sign as `a' is returned.
3784
*----------------------------------------------------------------------------*/
3785

    
3786
int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3787
{
3788
    flag aSign;
3789
    int32 aExp, shiftCount;
3790
    bits64 aSig, aSigExtra;
3791

    
3792
    aSig = extractFloatx80Frac( a );
3793
    aExp = extractFloatx80Exp( a );
3794
    aSign = extractFloatx80Sign( a );
3795
    shiftCount = 0x403E - aExp;
3796
    if ( shiftCount <= 0 ) {
3797
        if ( shiftCount ) {
3798
            float_raise( float_flag_invalid STATUS_VAR);
3799
            if (    ! aSign
3800
                 || (    ( aExp == 0x7FFF )
3801
                      && ( aSig != LIT64( 0x8000000000000000 ) ) )
3802
               ) {
3803
                return LIT64( 0x7FFFFFFFFFFFFFFF );
3804
            }
3805
            return (sbits64) LIT64( 0x8000000000000000 );
3806
        }
3807
        aSigExtra = 0;
3808
    }
3809
    else {
3810
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3811
    }
3812
    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3813

    
3814
}
3815

    
3816
/*----------------------------------------------------------------------------
3817
| Returns the result of converting the extended double-precision floating-
3818
| point value `a' to the 64-bit two's complement integer format.  The
3819
| conversion is performed according to the IEC/IEEE Standard for Binary
3820
| Floating-Point Arithmetic, except that the conversion is always rounded
3821
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3822
| Otherwise, if the conversion overflows, the largest integer with the same
3823
| sign as `a' is returned.
3824
*----------------------------------------------------------------------------*/
3825

    
3826
int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3827
{
3828
    flag aSign;
3829
    int32 aExp, shiftCount;
3830
    bits64 aSig;
3831
    int64 z;
3832

    
3833
    aSig = extractFloatx80Frac( a );
3834
    aExp = extractFloatx80Exp( a );
3835
    aSign = extractFloatx80Sign( a );
3836
    shiftCount = aExp - 0x403E;
3837
    if ( 0 <= shiftCount ) {
3838
        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3839
        if ( ( a.high != 0xC03E ) || aSig ) {
3840
            float_raise( float_flag_invalid STATUS_VAR);
3841
            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3842
                return LIT64( 0x7FFFFFFFFFFFFFFF );
3843
            }
3844
        }
3845
        return (sbits64) LIT64( 0x8000000000000000 );
3846
    }
3847
    else if ( aExp < 0x3FFF ) {
3848
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3849
        return 0;
3850
    }
3851
    z = aSig>>( - shiftCount );
3852
    if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3853
        STATUS(float_exception_flags) |= float_flag_inexact;
3854
    }
3855
    if ( aSign ) z = - z;
3856
    return z;
3857

    
3858
}
3859

    
3860
/*----------------------------------------------------------------------------
3861
| Returns the result of converting the extended double-precision floating-
3862
| point value `a' to the single-precision floating-point format.  The
3863
| conversion is performed according to the IEC/IEEE Standard for Binary
3864
| Floating-Point Arithmetic.
3865
*----------------------------------------------------------------------------*/
3866

    
3867
float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3868
{
3869
    flag aSign;
3870
    int32 aExp;
3871
    bits64 aSig;
3872

    
3873
    aSig = extractFloatx80Frac( a );
3874
    aExp = extractFloatx80Exp( a );
3875
    aSign = extractFloatx80Sign( a );
3876
    if ( aExp == 0x7FFF ) {
3877
        if ( (bits64) ( aSig<<1 ) ) {
3878
            return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3879
        }
3880
        return packFloat32( aSign, 0xFF, 0 );
3881
    }
3882
    shift64RightJamming( aSig, 33, &aSig );
3883
    if ( aExp || aSig ) aExp -= 0x3F81;
3884
    return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3885

    
3886
}
3887

    
3888
/*----------------------------------------------------------------------------
3889
| Returns the result of converting the extended double-precision floating-
3890
| point value `a' to the double-precision floating-point format.  The
3891
| conversion is performed according to the IEC/IEEE Standard for Binary
3892
| Floating-Point Arithmetic.
3893
*----------------------------------------------------------------------------*/
3894

    
3895
float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3896
{
3897
    flag aSign;
3898
    int32 aExp;
3899
    bits64 aSig, zSig;
3900

    
3901
    aSig = extractFloatx80Frac( a );
3902
    aExp = extractFloatx80Exp( a );
3903
    aSign = extractFloatx80Sign( a );
3904
    if ( aExp == 0x7FFF ) {
3905
        if ( (bits64) ( aSig<<1 ) ) {
3906
            return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3907
        }
3908
        return packFloat64( aSign, 0x7FF, 0 );
3909
    }
3910
    shift64RightJamming( aSig, 1, &zSig );
3911
    if ( aExp || aSig ) aExp -= 0x3C01;
3912
    return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3913

    
3914
}
3915

    
3916
#ifdef FLOAT128
3917

    
3918
/*----------------------------------------------------------------------------
3919
| Returns the result of converting the extended double-precision floating-
3920
| point value `a' to the quadruple-precision floating-point format.  The
3921
| conversion is performed according to the IEC/IEEE Standard for Binary
3922
| Floating-Point Arithmetic.
3923
*----------------------------------------------------------------------------*/
3924

    
3925
float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3926
{
3927
    flag aSign;
3928
    int16 aExp;
3929
    bits64 aSig, zSig0, zSig1;
3930

    
3931
    aSig = extractFloatx80Frac( a );
3932
    aExp = extractFloatx80Exp( a );
3933
    aSign = extractFloatx80Sign( a );
3934
    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3935
        return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3936
    }
3937
    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3938
    return packFloat128( aSign, aExp, zSig0, zSig1 );
3939

    
3940
}
3941

    
3942
#endif
3943

    
3944
/*----------------------------------------------------------------------------
3945
| Rounds the extended double-precision floating-point value `a' to an integer,
3946
| and returns the result as an extended quadruple-precision floating-point
3947
| value.  The operation is performed according to the IEC/IEEE Standard for
3948
| Binary Floating-Point Arithmetic.
3949
*----------------------------------------------------------------------------*/
3950

    
3951
floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3952
{
3953
    flag aSign;
3954
    int32 aExp;
3955
    bits64 lastBitMask, roundBitsMask;
3956
    int8 roundingMode;
3957
    floatx80 z;
3958

    
3959
    aExp = extractFloatx80Exp( a );
3960
    if ( 0x403E <= aExp ) {
3961
        if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3962
            return propagateFloatx80NaN( a, a STATUS_VAR );
3963
        }
3964
        return a;
3965
    }
3966
    if ( aExp < 0x3FFF ) {
3967
        if (    ( aExp == 0 )
3968
             && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3969
            return a;
3970
        }
3971
        STATUS(float_exception_flags) |= float_flag_inexact;
3972
        aSign = extractFloatx80Sign( a );
3973
        switch ( STATUS(float_rounding_mode) ) {
3974
         case float_round_nearest_even:
3975
            if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3976
               ) {
3977
                return
3978
                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3979
            }
3980
            break;
3981
         case float_round_down:
3982
            return
3983
                  aSign ?
3984
                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3985
                : packFloatx80( 0, 0, 0 );
3986
         case float_round_up:
3987
            return
3988
                  aSign ? packFloatx80( 1, 0, 0 )
3989
                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3990
        }
3991
        return packFloatx80( aSign, 0, 0 );
3992
    }
3993
    lastBitMask = 1;
3994
    lastBitMask <<= 0x403E - aExp;
3995
    roundBitsMask = lastBitMask - 1;
3996
    z = a;
3997
    roundingMode = STATUS(float_rounding_mode);
3998
    if ( roundingMode == float_round_nearest_even ) {
3999
        z.low += lastBitMask>>1;
4000
        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4001
    }
4002
    else if ( roundingMode != float_round_to_zero ) {
4003
        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4004
            z.low += roundBitsMask;
4005
        }
4006
    }
4007
    z.low &= ~ roundBitsMask;
4008
    if ( z.low == 0 ) {
4009
        ++z.high;
4010
        z.low = LIT64( 0x8000000000000000 );
4011
    }
4012
    if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4013
    return z;
4014

    
4015
}
4016

    
4017
/*----------------------------------------------------------------------------
4018
| Returns the result of adding the absolute values of the extended double-
4019
| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4020
| negated before being returned.  `zSign' is ignored if the result is a NaN.
4021
| The addition is performed according to the IEC/IEEE Standard for Binary
4022
| Floating-Point Arithmetic.
4023
*----------------------------------------------------------------------------*/
4024

    
4025
static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4026
{
4027
    int32 aExp, bExp, zExp;
4028
    bits64 aSig, bSig, zSig0, zSig1;
4029
    int32 expDiff;
4030

    
4031
    aSig = extractFloatx80Frac( a );
4032
    aExp = extractFloatx80Exp( a );
4033
    bSig = extractFloatx80Frac( b );
4034
    bExp = extractFloatx80Exp( b );
4035
    expDiff = aExp - bExp;
4036
    if ( 0 < expDiff ) {
4037
        if ( aExp == 0x7FFF ) {
4038
            if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4039
            return a;
4040
        }
4041
        if ( bExp == 0 ) --expDiff;
4042
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4043
        zExp = aExp;
4044
    }
4045
    else if ( expDiff < 0 ) {
4046
        if ( bExp == 0x7FFF ) {
4047
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4048
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4049
        }
4050
        if ( aExp == 0 ) ++expDiff;
4051
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4052
        zExp = bExp;
4053
    }
4054
    else {
4055
        if ( aExp == 0x7FFF ) {
4056
            if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
4057
                return propagateFloatx80NaN( a, b STATUS_VAR );
4058
            }
4059
            return a;
4060
        }
4061
        zSig1 = 0;
4062
        zSig0 = aSig + bSig;
4063
        if ( aExp == 0 ) {
4064
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4065
            goto roundAndPack;
4066
        }
4067
        zExp = aExp;
4068
        goto shiftRight1;
4069
    }
4070
    zSig0 = aSig + bSig;
4071
    if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
4072
 shiftRight1:
4073
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4074
    zSig0 |= LIT64( 0x8000000000000000 );
4075
    ++zExp;
4076
 roundAndPack:
4077
    return
4078
        roundAndPackFloatx80(
4079
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4080

    
4081
}
4082

    
4083
/*----------------------------------------------------------------------------
4084
| Returns the result of subtracting the absolute values of the extended
4085
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4086
| difference is negated before being returned.  `zSign' is ignored if the
4087
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
4088
| Standard for Binary Floating-Point Arithmetic.
4089
*----------------------------------------------------------------------------*/
4090

    
4091
static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4092
{
4093
    int32 aExp, bExp, zExp;
4094
    bits64 aSig, bSig, zSig0, zSig1;
4095
    int32 expDiff;
4096
    floatx80 z;
4097

    
4098
    aSig = extractFloatx80Frac( a );
4099
    aExp = extractFloatx80Exp( a );
4100
    bSig = extractFloatx80Frac( b );
4101
    bExp = extractFloatx80Exp( b );
4102
    expDiff = aExp - bExp;
4103
    if ( 0 < expDiff ) goto aExpBigger;
4104
    if ( expDiff < 0 ) goto bExpBigger;
4105
    if ( aExp == 0x7FFF ) {
4106
        if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
4107
            return propagateFloatx80NaN( a, b STATUS_VAR );
4108
        }
4109
        float_raise( float_flag_invalid STATUS_VAR);
4110
        z.low = floatx80_default_nan_low;
4111
        z.high = floatx80_default_nan_high;
4112
        return z;
4113
    }
4114
    if ( aExp == 0 ) {
4115
        aExp = 1;
4116
        bExp = 1;
4117
    }
4118
    zSig1 = 0;
4119
    if ( bSig < aSig ) goto aBigger;
4120
    if ( aSig < bSig ) goto bBigger;
4121
    return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4122
 bExpBigger:
4123
    if ( bExp == 0x7FFF ) {
4124
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4125
        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4126
    }
4127
    if ( aExp == 0 ) ++expDiff;
4128
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4129
 bBigger:
4130
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4131
    zExp = bExp;
4132
    zSign ^= 1;
4133
    goto normalizeRoundAndPack;
4134
 aExpBigger:
4135
    if ( aExp == 0x7FFF ) {
4136
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4137
        return a;
4138
    }
4139
    if ( bExp == 0 ) --expDiff;
4140
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4141
 aBigger:
4142
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4143
    zExp = aExp;
4144
 normalizeRoundAndPack:
4145
    return
4146
        normalizeRoundAndPackFloatx80(
4147
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4148

    
4149
}
4150

    
4151
/*----------------------------------------------------------------------------
4152
| Returns the result of adding the extended double-precision floating-point
4153
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4154
| Standard for Binary Floating-Point Arithmetic.
4155
*----------------------------------------------------------------------------*/
4156

    
4157
floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4158
{
4159
    flag aSign, bSign;
4160

    
4161
    aSign = extractFloatx80Sign( a );
4162
    bSign = extractFloatx80Sign( b );
4163
    if ( aSign == bSign ) {
4164
        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4165
    }
4166
    else {
4167
        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4168
    }
4169

    
4170
}
4171

    
4172
/*----------------------------------------------------------------------------
4173
| Returns the result of subtracting the extended double-precision floating-
4174
| point values `a' and `b'.  The operation is performed according to the
4175
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4176
*----------------------------------------------------------------------------*/
4177

    
4178
floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4179
{
4180
    flag aSign, bSign;
4181

    
4182
    aSign = extractFloatx80Sign( a );
4183
    bSign = extractFloatx80Sign( b );
4184
    if ( aSign == bSign ) {
4185
        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4186
    }
4187
    else {
4188
        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4189
    }
4190

    
4191
}
4192

    
4193
/*----------------------------------------------------------------------------
4194
| Returns the result of multiplying the extended double-precision floating-
4195
| point values `a' and `b'.  The operation is performed according to the
4196
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4197
*----------------------------------------------------------------------------*/
4198

    
4199
floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4200
{
4201
    flag aSign, bSign, zSign;
4202
    int32 aExp, bExp, zExp;
4203
    bits64 aSig, bSig, zSig0, zSig1;
4204
    floatx80 z;
4205

    
4206
    aSig = extractFloatx80Frac( a );
4207
    aExp = extractFloatx80Exp( a );
4208
    aSign = extractFloatx80Sign( a );
4209
    bSig = extractFloatx80Frac( b );
4210
    bExp = extractFloatx80Exp( b );
4211
    bSign = extractFloatx80Sign( b );
4212
    zSign = aSign ^ bSign;
4213
    if ( aExp == 0x7FFF ) {
4214
        if (    (bits64) ( aSig<<1 )
4215
             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4216
            return propagateFloatx80NaN( a, b STATUS_VAR );
4217
        }
4218
        if ( ( bExp | bSig ) == 0 ) goto invalid;
4219
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4220
    }
4221
    if ( bExp == 0x7FFF ) {
4222
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4223
        if ( ( aExp | aSig ) == 0 ) {
4224
 invalid:
4225
            float_raise( float_flag_invalid STATUS_VAR);
4226
            z.low = floatx80_default_nan_low;
4227
            z.high = floatx80_default_nan_high;
4228
            return z;
4229
        }
4230
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4231
    }
4232
    if ( aExp == 0 ) {
4233
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4234
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4235
    }
4236
    if ( bExp == 0 ) {
4237
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4238
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4239
    }
4240
    zExp = aExp + bExp - 0x3FFE;
4241
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
4242
    if ( 0 < (sbits64) zSig0 ) {
4243
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4244
        --zExp;
4245
    }
4246
    return
4247
        roundAndPackFloatx80(
4248
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4249

    
4250
}
4251

    
4252
/*----------------------------------------------------------------------------
4253
| Returns the result of dividing the extended double-precision floating-point
4254
| value `a' by the corresponding value `b'.  The operation is performed
4255
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4256
*----------------------------------------------------------------------------*/
4257

    
4258
floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4259
{
4260
    flag aSign, bSign, zSign;
4261
    int32 aExp, bExp, zExp;
4262
    bits64 aSig, bSig, zSig0, zSig1;
4263
    bits64 rem0, rem1, rem2, term0, term1, term2;
4264
    floatx80 z;
4265

    
4266
    aSig = extractFloatx80Frac( a );
4267
    aExp = extractFloatx80Exp( a );
4268
    aSign = extractFloatx80Sign( a );
4269
    bSig = extractFloatx80Frac( b );
4270
    bExp = extractFloatx80Exp( b );
4271
    bSign = extractFloatx80Sign( b );
4272
    zSign = aSign ^ bSign;
4273
    if ( aExp == 0x7FFF ) {
4274
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4275
        if ( bExp == 0x7FFF ) {
4276
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4277
            goto invalid;
4278
        }
4279
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4280
    }
4281
    if ( bExp == 0x7FFF ) {
4282
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4283
        return packFloatx80( zSign, 0, 0 );
4284
    }
4285
    if ( bExp == 0 ) {
4286
        if ( bSig == 0 ) {
4287
            if ( ( aExp | aSig ) == 0 ) {
4288
 invalid:
4289
                float_raise( float_flag_invalid STATUS_VAR);
4290
                z.low = floatx80_default_nan_low;
4291
                z.high = floatx80_default_nan_high;
4292
                return z;
4293
            }
4294
            float_raise( float_flag_divbyzero STATUS_VAR);
4295
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4296
        }
4297
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4298
    }
4299
    if ( aExp == 0 ) {
4300
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4301
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4302
    }
4303
    zExp = aExp - bExp + 0x3FFE;
4304
    rem1 = 0;
4305
    if ( bSig <= aSig ) {
4306
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
4307
        ++zExp;
4308
    }
4309
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4310
    mul64To128( bSig, zSig0, &term0, &term1 );
4311
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4312
    while ( (sbits64) rem0 < 0 ) {
4313
        --zSig0;
4314
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4315
    }
4316
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
4317
    if ( (bits64) ( zSig1<<1 ) <= 8 ) {
4318
        mul64To128( bSig, zSig1, &term1, &term2 );
4319
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4320
        while ( (sbits64) rem1 < 0 ) {
4321
            --zSig1;
4322
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4323
        }
4324
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
4325
    }
4326
    return
4327
        roundAndPackFloatx80(
4328
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4329

    
4330
}
4331

    
4332
/*----------------------------------------------------------------------------
4333
| Returns the remainder of the extended double-precision floating-point value
4334
| `a' with respect to the corresponding value `b'.  The operation is performed
4335
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4336
*----------------------------------------------------------------------------*/
4337

    
4338
floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4339
{
4340
    flag aSign, zSign;
4341
    int32 aExp, bExp, expDiff;
4342
    bits64 aSig0, aSig1, bSig;
4343
    bits64 q, term0, term1, alternateASig0, alternateASig1;
4344
    floatx80 z;
4345

    
4346
    aSig0 = extractFloatx80Frac( a );
4347
    aExp = extractFloatx80Exp( a );
4348
    aSign = extractFloatx80Sign( a );
4349
    bSig = extractFloatx80Frac( b );
4350
    bExp = extractFloatx80Exp( b );
4351
    if ( aExp == 0x7FFF ) {
4352
        if (    (bits64) ( aSig0<<1 )
4353
             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4354
            return propagateFloatx80NaN( a, b STATUS_VAR );
4355
        }
4356
        goto invalid;
4357
    }
4358
    if ( bExp == 0x7FFF ) {
4359
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4360
        return a;
4361
    }
4362
    if ( bExp == 0 ) {
4363
        if ( bSig == 0 ) {
4364
 invalid:
4365
            float_raise( float_flag_invalid STATUS_VAR);
4366
            z.low = floatx80_default_nan_low;
4367
            z.high = floatx80_default_nan_high;
4368
            return z;
4369
        }
4370
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4371
    }
4372
    if ( aExp == 0 ) {
4373
        if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
4374
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4375
    }
4376
    bSig |= LIT64( 0x8000000000000000 );
4377
    zSign = aSign;
4378
    expDiff = aExp - bExp;
4379
    aSig1 = 0;
4380
    if ( expDiff < 0 ) {
4381
        if ( expDiff < -1 ) return a;
4382
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4383
        expDiff = 0;
4384
    }
4385
    q = ( bSig <= aSig0 );
4386
    if ( q ) aSig0 -= bSig;
4387
    expDiff -= 64;
4388
    while ( 0 < expDiff ) {
4389
        q = estimateDiv128To64( aSig0, aSig1, bSig );
4390
        q = ( 2 < q ) ? q - 2 : 0;
4391
        mul64To128( bSig, q, &term0, &term1 );
4392
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4393
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4394
        expDiff -= 62;
4395
    }
4396
    expDiff += 64;
4397
    if ( 0 < expDiff ) {
4398
        q = estimateDiv128To64( aSig0, aSig1, bSig );
4399
        q = ( 2 < q ) ? q - 2 : 0;
4400
        q >>= 64 - expDiff;
4401
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4402
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4403
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4404
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
4405
            ++q;
4406
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4407
        }
4408
    }
4409
    else {
4410
        term1 = 0;
4411
        term0 = bSig;
4412
    }
4413
    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4414
    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4415
         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4416
              && ( q & 1 ) )
4417
       ) {
4418
        aSig0 = alternateASig0;
4419
        aSig1 = alternateASig1;
4420
        zSign = ! zSign;
4421
    }
4422
    return
4423
        normalizeRoundAndPackFloatx80(
4424
            80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4425

    
4426
}
4427

    
4428
/*----------------------------------------------------------------------------
4429
| Returns the square root of the extended double-precision floating-point
4430
| value `a'.  The operation is performed according to the IEC/IEEE Standard
4431
| for Binary Floating-Point Arithmetic.
4432
*----------------------------------------------------------------------------*/
4433

    
4434
floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4435
{
4436
    flag aSign;
4437
    int32 aExp, zExp;
4438
    bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4439
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4440
    floatx80 z;
4441

    
4442
    aSig0 = extractFloatx80Frac( a );
4443
    aExp = extractFloatx80Exp( a );
4444
    aSign = extractFloatx80Sign( a );
4445
    if ( aExp == 0x7FFF ) {
4446
        if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4447
        if ( ! aSign ) return a;
4448
        goto invalid;
4449
    }
4450
    if ( aSign ) {
4451
        if ( ( aExp | aSig0 ) == 0 ) return a;
4452
 invalid:
4453
        float_raise( float_flag_invalid STATUS_VAR);
4454
        z.low = floatx80_default_nan_low;
4455
        z.high = floatx80_default_nan_high;
4456
        return z;
4457
    }
4458
    if ( aExp == 0 ) {
4459
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4460
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4461
    }
4462
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4463
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4464
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4465
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4466
    doubleZSig0 = zSig0<<1;
4467
    mul64To128( zSig0, zSig0, &term0, &term1 );
4468
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4469
    while ( (sbits64) rem0 < 0 ) {
4470
        --zSig0;
4471
        doubleZSig0 -= 2;
4472
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4473
    }
4474
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4475
    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4476
        if ( zSig1 == 0 ) zSig1 = 1;
4477
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4478
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4479
        mul64To128( zSig1, zSig1, &term2, &term3 );
4480
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4481
        while ( (sbits64) rem1 < 0 ) {
4482
            --zSig1;
4483
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4484
            term3 |= 1;
4485
            term2 |= doubleZSig0;
4486
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4487
        }
4488
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4489
    }
4490
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4491
    zSig0 |= doubleZSig0;
4492
    return
4493
        roundAndPackFloatx80(
4494
            STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4495

    
4496
}
4497

    
4498
/*----------------------------------------------------------------------------
4499
| Returns 1 if the extended double-precision floating-point value `a' is
4500
| equal to the corresponding value `b', and 0 otherwise.  The comparison is
4501
| performed according to the IEC/IEEE Standard for Binary Floating-Point
4502
| Arithmetic.
4503
*----------------------------------------------------------------------------*/
4504

    
4505
int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4506
{
4507

    
4508
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4509
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4510
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4511
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4512
       ) {
4513
        if (    floatx80_is_signaling_nan( a )
4514
             || floatx80_is_signaling_nan( b ) ) {
4515
            float_raise( float_flag_invalid STATUS_VAR);
4516
        }
4517
        return 0;
4518
    }
4519
    return
4520
           ( a.low == b.low )
4521
        && (    ( a.high == b.high )
4522
             || (    ( a.low == 0 )
4523
                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4524
           );
4525

    
4526
}
4527

    
4528
/*----------------------------------------------------------------------------
4529
| Returns 1 if the extended double-precision floating-point value `a' is
4530
| less than or equal to the corresponding value `b', and 0 otherwise.  The
4531
| comparison is performed according to the IEC/IEEE Standard for Binary
4532
| Floating-Point Arithmetic.
4533
*----------------------------------------------------------------------------*/
4534

    
4535
int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4536
{
4537
    flag aSign, bSign;
4538

    
4539
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4540
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4541
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4542
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4543
       ) {
4544
        float_raise( float_flag_invalid STATUS_VAR);
4545
        return 0;
4546
    }
4547
    aSign = extractFloatx80Sign( a );
4548
    bSign = extractFloatx80Sign( b );
4549
    if ( aSign != bSign ) {
4550
        return
4551
               aSign
4552
            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4553
                 == 0 );
4554
    }
4555
    return
4556
          aSign ? le128( b.high, b.low, a.high, a.low )
4557
        : le128( a.high, a.low, b.high, b.low );
4558

    
4559
}
4560

    
4561
/*----------------------------------------------------------------------------
4562
| Returns 1 if the extended double-precision floating-point value `a' is
4563
| less than the corresponding value `b', and 0 otherwise.  The comparison
4564
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4565
| Arithmetic.
4566
*----------------------------------------------------------------------------*/
4567

    
4568
int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4569
{
4570
    flag aSign, bSign;
4571

    
4572
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4573
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4574
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4575
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4576
       ) {
4577
        float_raise( float_flag_invalid STATUS_VAR);
4578
        return 0;
4579
    }
4580
    aSign = extractFloatx80Sign( a );
4581
    bSign = extractFloatx80Sign( b );
4582
    if ( aSign != bSign ) {
4583
        return
4584
               aSign
4585
            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4586
                 != 0 );
4587
    }
4588
    return
4589
          aSign ? lt128( b.high, b.low, a.high, a.low )
4590
        : lt128( a.high, a.low, b.high, b.low );
4591

    
4592
}
4593

    
4594
/*----------------------------------------------------------------------------
4595
| Returns 1 if the extended double-precision floating-point value `a' is equal
4596
| to the corresponding value `b', and 0 otherwise.  The invalid exception is
4597
| raised if either operand is a NaN.  Otherwise, the comparison is performed
4598
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4599
*----------------------------------------------------------------------------*/
4600

    
4601
int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4602
{
4603

    
4604
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4605
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4606
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4607
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4608
       ) {
4609
        float_raise( float_flag_invalid STATUS_VAR);
4610
        return 0;
4611
    }
4612
    return
4613
           ( a.low == b.low )
4614
        && (    ( a.high == b.high )
4615
             || (    ( a.low == 0 )
4616
                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4617
           );
4618

    
4619
}
4620

    
4621
/*----------------------------------------------------------------------------
4622
| Returns 1 if the extended double-precision floating-point value `a' is less
4623
| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4624
| do not cause an exception.  Otherwise, the comparison is performed according
4625
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4626
*----------------------------------------------------------------------------*/
4627

    
4628
int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4629
{
4630
    flag aSign, bSign;
4631

    
4632
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4633
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4634
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4635
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4636
       ) {
4637
        if (    floatx80_is_signaling_nan( a )
4638
             || floatx80_is_signaling_nan( b ) ) {
4639
            float_raise( float_flag_invalid STATUS_VAR);
4640
        }
4641
        return 0;
4642
    }
4643
    aSign = extractFloatx80Sign( a );
4644
    bSign = extractFloatx80Sign( b );
4645
    if ( aSign != bSign ) {
4646
        return
4647
               aSign
4648
            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4649
                 == 0 );
4650
    }
4651
    return
4652
          aSign ? le128( b.high, b.low, a.high, a.low )
4653
        : le128( a.high, a.low, b.high, b.low );
4654

    
4655
}
4656

    
4657
/*----------------------------------------------------------------------------
4658
| Returns 1 if the extended double-precision floating-point value `a' is less
4659
| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4660
| an exception.  Otherwise, the comparison is performed according to the
4661
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4662
*----------------------------------------------------------------------------*/
4663

    
4664
int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4665
{
4666
    flag aSign, bSign;
4667

    
4668
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4669
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4670
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4671
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4672
       ) {
4673
        if (    floatx80_is_signaling_nan( a )
4674
             || floatx80_is_signaling_nan( b ) ) {
4675
            float_raise( float_flag_invalid STATUS_VAR);
4676
        }
4677
        return 0;
4678
    }
4679
    aSign = extractFloatx80Sign( a );
4680
    bSign = extractFloatx80Sign( b );
4681
    if ( aSign != bSign ) {
4682
        return
4683
               aSign
4684
            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4685
                 != 0 );
4686
    }
4687
    return
4688
          aSign ? lt128( b.high, b.low, a.high, a.low )
4689
        : lt128( a.high, a.low, b.high, b.low );
4690

    
4691
}
4692

    
4693
#endif
4694

    
4695
#ifdef FLOAT128
4696

    
4697
/*----------------------------------------------------------------------------
4698
| Returns the result of converting the quadruple-precision floating-point
4699
| value `a' to the 32-bit two's complement integer format.  The conversion
4700
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4701
| Arithmetic---which means in particular that the conversion is rounded
4702
| according to the current rounding mode.  If `a' is a NaN, the largest
4703
| positive integer is returned.  Otherwise, if the conversion overflows, the
4704
| largest integer with the same sign as `a' is returned.
4705
*----------------------------------------------------------------------------*/
4706

    
4707
int32 float128_to_int32( float128 a STATUS_PARAM )
4708
{
4709
    flag aSign;
4710
    int32 aExp, shiftCount;
4711
    bits64 aSig0, aSig1;
4712

    
4713
    aSig1 = extractFloat128Frac1( a );
4714
    aSig0 = extractFloat128Frac0( a );
4715
    aExp = extractFloat128Exp( a );
4716
    aSign = extractFloat128Sign( a );
4717
    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4718
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4719
    aSig0 |= ( aSig1 != 0 );
4720
    shiftCount = 0x4028 - aExp;
4721
    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4722
    return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4723

    
4724
}
4725

    
4726
/*----------------------------------------------------------------------------
4727
| Returns the result of converting the quadruple-precision floating-point
4728
| value `a' to the 32-bit two's complement integer format.  The conversion
4729
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4730
| Arithmetic, except that the conversion is always rounded toward zero.  If
4731
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4732
| conversion overflows, the largest integer with the same sign as `a' is
4733
| returned.
4734
*----------------------------------------------------------------------------*/
4735

    
4736
int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4737
{
4738
    flag aSign;
4739
    int32 aExp, shiftCount;
4740
    bits64 aSig0, aSig1, savedASig;
4741
    int32 z;
4742

    
4743
    aSig1 = extractFloat128Frac1( a );
4744
    aSig0 = extractFloat128Frac0( a );
4745
    aExp = extractFloat128Exp( a );
4746
    aSign = extractFloat128Sign( a );
4747
    aSig0 |= ( aSig1 != 0 );
4748
    if ( 0x401E < aExp ) {
4749
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4750
        goto invalid;
4751
    }
4752
    else if ( aExp < 0x3FFF ) {
4753
        if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4754
        return 0;
4755
    }
4756
    aSig0 |= LIT64( 0x0001000000000000 );
4757
    shiftCount = 0x402F - aExp;
4758
    savedASig = aSig0;
4759
    aSig0 >>= shiftCount;
4760
    z = aSig0;
4761
    if ( aSign ) z = - z;
4762
    if ( ( z < 0 ) ^ aSign ) {
4763
 invalid:
4764
        float_raise( float_flag_invalid STATUS_VAR);
4765
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4766
    }
4767
    if ( ( aSig0<<shiftCount ) != savedASig ) {
4768
        STATUS(float_exception_flags) |= float_flag_inexact;
4769
    }
4770
    return z;
4771

    
4772
}
4773

    
4774
/*----------------------------------------------------------------------------
4775
| Returns the result of converting the quadruple-precision floating-point
4776
| value `a' to the 64-bit two's complement integer format.  The conversion
4777
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4778
| Arithmetic---which means in particular that the conversion is rounded
4779
| according to the current rounding mode.  If `a' is a NaN, the largest
4780
| positive integer is returned.  Otherwise, if the conversion overflows, the
4781
| largest integer with the same sign as `a' is returned.
4782
*----------------------------------------------------------------------------*/
4783

    
4784
int64 float128_to_int64( float128 a STATUS_PARAM )
4785
{
4786
    flag aSign;
4787
    int32 aExp, shiftCount;
4788
    bits64 aSig0, aSig1;
4789

    
4790
    aSig1 = extractFloat128Frac1( a );
4791
    aSig0 = extractFloat128Frac0( a );
4792
    aExp = extractFloat128Exp( a );
4793
    aSign = extractFloat128Sign( a );
4794
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4795
    shiftCount = 0x402F - aExp;
4796
    if ( shiftCount <= 0 ) {
4797
        if ( 0x403E < aExp ) {
4798
            float_raise( float_flag_invalid STATUS_VAR);
4799
            if (    ! aSign
4800
                 || (    ( aExp == 0x7FFF )
4801
                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4802
                    )
4803
               ) {
4804
                return LIT64( 0x7FFFFFFFFFFFFFFF );
4805
            }
4806
            return (sbits64) LIT64( 0x8000000000000000 );
4807
        }
4808
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4809
    }
4810
    else {
4811
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4812
    }
4813
    return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4814

    
4815
}
4816

    
4817
/*----------------------------------------------------------------------------
4818
| Returns the result of converting the quadruple-precision floating-point
4819
| value `a' to the 64-bit two's complement integer format.  The conversion
4820
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4821
| Arithmetic, except that the conversion is always rounded toward zero.
4822
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4823
| the conversion overflows, the largest integer with the same sign as `a' is
4824
| returned.
4825
*----------------------------------------------------------------------------*/
4826

    
4827
int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4828
{
4829
    flag aSign;
4830
    int32 aExp, shiftCount;
4831
    bits64 aSig0, aSig1;
4832
    int64 z;
4833

    
4834
    aSig1 = extractFloat128Frac1( a );
4835
    aSig0 = extractFloat128Frac0( a );
4836
    aExp = extractFloat128Exp( a );
4837
    aSign = extractFloat128Sign( a );
4838
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4839
    shiftCount = aExp - 0x402F;
4840
    if ( 0 < shiftCount ) {
4841
        if ( 0x403E <= aExp ) {
4842
            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4843
            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4844
                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4845
                if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4846
            }
4847
            else {
4848
                float_raise( float_flag_invalid STATUS_VAR);
4849
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4850
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
4851
                }
4852
            }
4853
            return (sbits64) LIT64( 0x8000000000000000 );
4854
        }
4855
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4856
        if ( (bits64) ( aSig1<<shiftCount ) ) {
4857
            STATUS(float_exception_flags) |= float_flag_inexact;
4858
        }
4859
    }
4860
    else {
4861
        if ( aExp < 0x3FFF ) {
4862
            if ( aExp | aSig0 | aSig1 ) {
4863
                STATUS(float_exception_flags) |= float_flag_inexact;
4864
            }
4865
            return 0;
4866
        }
4867
        z = aSig0>>( - shiftCount );
4868
        if (    aSig1
4869
             || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4870
            STATUS(float_exception_flags) |= float_flag_inexact;
4871
        }
4872
    }
4873
    if ( aSign ) z = - z;
4874
    return z;
4875

    
4876
}
4877

    
4878
/*----------------------------------------------------------------------------
4879
| Returns the result of converting the quadruple-precision floating-point
4880
| value `a' to the single-precision floating-point format.  The conversion
4881
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4882
| Arithmetic.
4883
*----------------------------------------------------------------------------*/
4884

    
4885
float32 float128_to_float32( float128 a STATUS_PARAM )
4886
{
4887
    flag aSign;
4888
    int32 aExp;
4889
    bits64 aSig0, aSig1;
4890
    bits32 zSig;
4891

    
4892
    aSig1 = extractFloat128Frac1( a );
4893
    aSig0 = extractFloat128Frac0( a );
4894
    aExp = extractFloat128Exp( a );
4895
    aSign = extractFloat128Sign( a );
4896
    if ( aExp == 0x7FFF ) {
4897
        if ( aSig0 | aSig1 ) {
4898
            return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4899
        }
4900
        return packFloat32( aSign, 0xFF, 0 );
4901
    }
4902
    aSig0 |= ( aSig1 != 0 );
4903
    shift64RightJamming( aSig0, 18, &aSig0 );
4904
    zSig = aSig0;
4905
    if ( aExp || zSig ) {
4906
        zSig |= 0x40000000;
4907
        aExp -= 0x3F81;
4908
    }
4909
    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4910

    
4911
}
4912

    
4913
/*----------------------------------------------------------------------------
4914
| Returns the result of converting the quadruple-precision floating-point
4915
| value `a' to the double-precision floating-point format.  The conversion
4916
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4917
| Arithmetic.
4918
*----------------------------------------------------------------------------*/
4919

    
4920
float64 float128_to_float64( float128 a STATUS_PARAM )
4921
{
4922
    flag aSign;
4923
    int32 aExp;
4924
    bits64 aSig0, aSig1;
4925

    
4926
    aSig1 = extractFloat128Frac1( a );
4927
    aSig0 = extractFloat128Frac0( a );
4928
    aExp = extractFloat128Exp( a );
4929
    aSign = extractFloat128Sign( a );
4930
    if ( aExp == 0x7FFF ) {
4931
        if ( aSig0 | aSig1 ) {
4932
            return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4933
        }
4934
        return packFloat64( aSign, 0x7FF, 0 );
4935
    }
4936
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4937
    aSig0 |= ( aSig1 != 0 );
4938
    if ( aExp || aSig0 ) {
4939
        aSig0 |= LIT64( 0x4000000000000000 );
4940
        aExp -= 0x3C01;
4941
    }
4942
    return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4943

    
4944
}
4945

    
4946
#ifdef FLOATX80
4947

    
4948
/*----------------------------------------------------------------------------
4949
| Returns the result of converting the quadruple-precision floating-point
4950
| value `a' to the extended double-precision floating-point format.  The
4951
| conversion is performed according to the IEC/IEEE Standard for Binary
4952
| Floating-Point Arithmetic.
4953
*----------------------------------------------------------------------------*/
4954

    
4955
floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4956
{
4957
    flag aSign;
4958
    int32 aExp;
4959
    bits64 aSig0, aSig1;
4960

    
4961
    aSig1 = extractFloat128Frac1( a );
4962
    aSig0 = extractFloat128Frac0( a );
4963
    aExp = extractFloat128Exp( a );
4964
    aSign = extractFloat128Sign( a );
4965
    if ( aExp == 0x7FFF ) {
4966
        if ( aSig0 | aSig1 ) {
4967
            return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4968
        }
4969
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4970
    }
4971
    if ( aExp == 0 ) {
4972
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4973
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4974
    }
4975
    else {
4976
        aSig0 |= LIT64( 0x0001000000000000 );
4977
    }
4978
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4979
    return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4980

    
4981
}
4982

    
4983
#endif
4984

    
4985
/*----------------------------------------------------------------------------
4986
| Rounds the quadruple-precision floating-point value `a' to an integer, and
4987
| returns the result as a quadruple-precision floating-point value.  The
4988
| operation is performed according to the IEC/IEEE Standard for Binary
4989
| Floating-Point Arithmetic.
4990
*----------------------------------------------------------------------------*/
4991

    
4992
float128 float128_round_to_int( float128 a STATUS_PARAM )
4993
{
4994
    flag aSign;
4995
    int32 aExp;
4996
    bits64 lastBitMask, roundBitsMask;
4997
    int8 roundingMode;
4998
    float128 z;
4999

    
5000
    aExp = extractFloat128Exp( a );
5001
    if ( 0x402F <= aExp ) {
5002
        if ( 0x406F <= aExp ) {
5003
            if (    ( aExp == 0x7FFF )
5004
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5005
               ) {
5006
                return propagateFloat128NaN( a, a STATUS_VAR );
5007
            }
5008
            return a;
5009
        }
5010
        lastBitMask = 1;
5011
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5012
        roundBitsMask = lastBitMask - 1;
5013
        z = a;
5014
        roundingMode = STATUS(float_rounding_mode);
5015
        if ( roundingMode == float_round_nearest_even ) {
5016
            if ( lastBitMask ) {
5017
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5018
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5019
            }
5020
            else {
5021
                if ( (sbits64) z.low < 0 ) {
5022
                    ++z.high;
5023
                    if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
5024
                }
5025
            }
5026
        }
5027
        else if ( roundingMode != float_round_to_zero ) {
5028
            if (   extractFloat128Sign( z )
5029
                 ^ ( roundingMode == float_round_up ) ) {
5030
                add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5031
            }
5032
        }
5033
        z.low &= ~ roundBitsMask;
5034
    }
5035
    else {
5036
        if ( aExp < 0x3FFF ) {
5037
            if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5038
            STATUS(float_exception_flags) |= float_flag_inexact;
5039
            aSign = extractFloat128Sign( a );
5040
            switch ( STATUS(float_rounding_mode) ) {
5041
             case float_round_nearest_even:
5042
                if (    ( aExp == 0x3FFE )
5043
                     && (   extractFloat128Frac0( a )
5044
                          | extractFloat128Frac1( a ) )
5045
                   ) {
5046
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
5047
                }
5048
                break;
5049
             case float_round_down:
5050
                return
5051
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5052
                    : packFloat128( 0, 0, 0, 0 );
5053
             case float_round_up:
5054
                return
5055
                      aSign ? packFloat128( 1, 0, 0, 0 )
5056
                    : packFloat128( 0, 0x3FFF, 0, 0 );
5057
            }
5058
            return packFloat128( aSign, 0, 0, 0 );
5059
        }
5060
        lastBitMask = 1;
5061
        lastBitMask <<= 0x402F - aExp;
5062
        roundBitsMask = lastBitMask - 1;
5063
        z.low = 0;
5064
        z.high = a.high;
5065
        roundingMode = STATUS(float_rounding_mode);
5066
        if ( roundingMode == float_round_nearest_even ) {
5067
            z.high += lastBitMask>>1;
5068
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5069
                z.high &= ~ lastBitMask;
5070
            }
5071
        }
5072
        else if ( roundingMode != float_round_to_zero ) {
5073
            if (   extractFloat128Sign( z )
5074
                 ^ ( roundingMode == float_round_up ) ) {
5075
                z.high |= ( a.low != 0 );
5076
                z.high += roundBitsMask;
5077
            }
5078
        }
5079
        z.high &= ~ roundBitsMask;
5080
    }
5081
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5082
        STATUS(float_exception_flags) |= float_flag_inexact;
5083
    }
5084
    return z;
5085

    
5086
}
5087

    
5088
/*----------------------------------------------------------------------------
5089
| Returns the result of adding the absolute values of the quadruple-precision
5090
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
5091
| before being returned.  `zSign' is ignored if the result is a NaN.
5092
| The addition is performed according to the IEC/IEEE Standard for Binary
5093
| Floating-Point Arithmetic.
5094
*----------------------------------------------------------------------------*/
5095

    
5096
static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5097
{
5098
    int32 aExp, bExp, zExp;
5099
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5100
    int32 expDiff;
5101

    
5102
    aSig1 = extractFloat128Frac1( a );
5103
    aSig0 = extractFloat128Frac0( a );
5104
    aExp = extractFloat128Exp( a );
5105
    bSig1 = extractFloat128Frac1( b );
5106
    bSig0 = extractFloat128Frac0( b );
5107
    bExp = extractFloat128Exp( b );
5108
    expDiff = aExp - bExp;
5109
    if ( 0 < expDiff ) {
5110
        if ( aExp == 0x7FFF ) {
5111
            if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5112
            return a;
5113
        }
5114
        if ( bExp == 0 ) {
5115
            --expDiff;
5116
        }
5117
        else {
5118
            bSig0 |= LIT64( 0x0001000000000000 );
5119
        }
5120
        shift128ExtraRightJamming(
5121
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5122
        zExp = aExp;
5123
    }
5124
    else if ( expDiff < 0 ) {
5125
        if ( bExp == 0x7FFF ) {
5126
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5127
            return packFloat128( zSign, 0x7FFF, 0, 0 );
5128
        }
5129
        if ( aExp == 0 ) {
5130
            ++expDiff;
5131
        }
5132
        else {
5133
            aSig0 |= LIT64( 0x0001000000000000 );
5134
        }
5135
        shift128ExtraRightJamming(
5136
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5137
        zExp = bExp;
5138
    }
5139
    else {
5140
        if ( aExp == 0x7FFF ) {
5141
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5142
                return propagateFloat128NaN( a, b STATUS_VAR );
5143
            }
5144
            return a;
5145
        }
5146
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5147
        if ( aExp == 0 ) {
5148
            if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
5149
            return packFloat128( zSign, 0, zSig0, zSig1 );
5150
        }
5151
        zSig2 = 0;
5152
        zSig0 |= LIT64( 0x0002000000000000 );
5153
        zExp = aExp;
5154
        goto shiftRight1;
5155
    }
5156
    aSig0 |= LIT64( 0x0001000000000000 );
5157
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5158
    --zExp;
5159
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5160
    ++zExp;
5161
 shiftRight1:
5162
    shift128ExtraRightJamming(
5163
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5164
 roundAndPack:
5165
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5166

    
5167
}
5168

    
5169
/*----------------------------------------------------------------------------
5170
| Returns the result of subtracting the absolute values of the quadruple-
5171
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
5172
| difference is negated before being returned.  `zSign' is ignored if the
5173
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
5174
| Standard for Binary Floating-Point Arithmetic.
5175
*----------------------------------------------------------------------------*/
5176

    
5177
static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5178
{
5179
    int32 aExp, bExp, zExp;
5180
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5181
    int32 expDiff;
5182
    float128 z;
5183

    
5184
    aSig1 = extractFloat128Frac1( a );
5185
    aSig0 = extractFloat128Frac0( a );
5186
    aExp = extractFloat128Exp( a );
5187
    bSig1 = extractFloat128Frac1( b );
5188
    bSig0 = extractFloat128Frac0( b );
5189
    bExp = extractFloat128Exp( b );
5190
    expDiff = aExp - bExp;
5191
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5192
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5193
    if ( 0 < expDiff ) goto aExpBigger;
5194
    if ( expDiff < 0 ) goto bExpBigger;
5195
    if ( aExp == 0x7FFF ) {
5196
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5197
            return propagateFloat128NaN( a, b STATUS_VAR );
5198
        }
5199
        float_raise( float_flag_invalid STATUS_VAR);
5200
        z.low = float128_default_nan_low;
5201
        z.high = float128_default_nan_high;
5202
        return z;
5203
    }
5204
    if ( aExp == 0 ) {
5205
        aExp = 1;
5206
        bExp = 1;
5207
    }
5208
    if ( bSig0 < aSig0 ) goto aBigger;
5209
    if ( aSig0 < bSig0 ) goto bBigger;
5210
    if ( bSig1 < aSig1 ) goto aBigger;
5211
    if ( aSig1 < bSig1 ) goto bBigger;
5212
    return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5213
 bExpBigger:
5214
    if ( bExp == 0x7FFF ) {
5215
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5216
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5217
    }
5218
    if ( aExp == 0 ) {
5219
        ++expDiff;
5220
    }
5221
    else {
5222
        aSig0 |= LIT64( 0x4000000000000000 );
5223
    }
5224
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5225
    bSig0 |= LIT64( 0x4000000000000000 );
5226
 bBigger:
5227
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5228
    zExp = bExp;
5229
    zSign ^= 1;
5230
    goto normalizeRoundAndPack;
5231
 aExpBigger:
5232
    if ( aExp == 0x7FFF ) {
5233
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5234
        return a;
5235
    }
5236
    if ( bExp == 0 ) {
5237
        --expDiff;
5238
    }
5239
    else {
5240
        bSig0 |= LIT64( 0x4000000000000000 );
5241
    }
5242
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5243
    aSig0 |= LIT64( 0x4000000000000000 );
5244
 aBigger:
5245
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5246
    zExp = aExp;
5247
 normalizeRoundAndPack:
5248
    --zExp;
5249
    return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5250

    
5251
}
5252

    
5253
/*----------------------------------------------------------------------------
5254
| Returns the result of adding the quadruple-precision floating-point values
5255
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
5256
| for Binary Floating-Point Arithmetic.
5257
*----------------------------------------------------------------------------*/
5258

    
5259
float128 float128_add( float128 a, float128 b STATUS_PARAM )
5260
{
5261
    flag aSign, bSign;
5262

    
5263
    aSign = extractFloat128Sign( a );
5264
    bSign = extractFloat128Sign( b );
5265
    if ( aSign == bSign ) {
5266
        return addFloat128Sigs( a, b, aSign STATUS_VAR );
5267
    }
5268
    else {
5269
        return subFloat128Sigs( a, b, aSign STATUS_VAR );
5270
    }
5271

    
5272
}
5273

    
5274
/*----------------------------------------------------------------------------
5275
| Returns the result of subtracting the quadruple-precision floating-point
5276
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
5277
| Standard for Binary Floating-Point Arithmetic.
5278
*----------------------------------------------------------------------------*/
5279

    
5280
float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5281
{
5282
    flag aSign, bSign;
5283

    
5284
    aSign = extractFloat128Sign( a );
5285
    bSign = extractFloat128Sign( b );
5286
    if ( aSign == bSign ) {
5287
        return subFloat128Sigs( a, b, aSign STATUS_VAR );
5288
    }
5289
    else {
5290
        return addFloat128Sigs( a, b, aSign STATUS_VAR );
5291
    }
5292

    
5293
}
5294

    
5295
/*----------------------------------------------------------------------------
5296
| Returns the result of multiplying the quadruple-precision floating-point
5297
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
5298
| Standard for Binary Floating-Point Arithmetic.
5299
*----------------------------------------------------------------------------*/
5300

    
5301
float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5302
{
5303
    flag aSign, bSign, zSign;
5304
    int32 aExp, bExp, zExp;
5305
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5306
    float128 z;
5307

    
5308
    aSig1 = extractFloat128Frac1( a );
5309
    aSig0 = extractFloat128Frac0( a );
5310
    aExp = extractFloat128Exp( a );
5311
    aSign = extractFloat128Sign( a );
5312
    bSig1 = extractFloat128Frac1( b );
5313
    bSig0 = extractFloat128Frac0( b );
5314
    bExp = extractFloat128Exp( b );
5315
    bSign = extractFloat128Sign( b );
5316
    zSign = aSign ^ bSign;
5317
    if ( aExp == 0x7FFF ) {
5318
        if (    ( aSig0 | aSig1 )
5319
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5320
            return propagateFloat128NaN( a, b STATUS_VAR );
5321
        }
5322
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5323
        return packFloat128( zSign, 0x7FFF, 0, 0 );
5324
    }
5325
    if ( bExp == 0x7FFF ) {
5326
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5327
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5328
 invalid:
5329
            float_raise( float_flag_invalid STATUS_VAR);
5330
            z.low = float128_default_nan_low;
5331
            z.high = float128_default_nan_high;
5332
            return z;
5333
        }
5334
        return packFloat128( zSign, 0x7FFF, 0, 0 );
5335
    }
5336
    if ( aExp == 0 ) {
5337
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5338
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5339
    }
5340
    if ( bExp == 0 ) {
5341
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5342
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5343
    }
5344
    zExp = aExp + bExp - 0x4000;
5345
    aSig0 |= LIT64( 0x0001000000000000 );
5346
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5347
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5348
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5349
    zSig2 |= ( zSig3 != 0 );
5350
    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5351
        shift128ExtraRightJamming(
5352
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5353
        ++zExp;
5354
    }
5355
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5356

    
5357
}
5358

    
5359
/*----------------------------------------------------------------------------
5360
| Returns the result of dividing the quadruple-precision floating-point value
5361
| `a' by the corresponding value `b'.  The operation is performed according to
5362
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5363
*----------------------------------------------------------------------------*/
5364

    
5365
float128 float128_div( float128 a, float128 b STATUS_PARAM )
5366
{
5367
    flag aSign, bSign, zSign;
5368
    int32 aExp, bExp, zExp;
5369
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5370
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5371
    float128 z;
5372

    
5373
    aSig1 = extractFloat128Frac1( a );
5374
    aSig0 = extractFloat128Frac0( a );
5375
    aExp = extractFloat128Exp( a );
5376
    aSign = extractFloat128Sign( a );
5377
    bSig1 = extractFloat128Frac1( b );
5378
    bSig0 = extractFloat128Frac0( b );
5379
    bExp = extractFloat128Exp( b );
5380
    bSign = extractFloat128Sign( b );
5381
    zSign = aSign ^ bSign;
5382
    if ( aExp == 0x7FFF ) {
5383
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5384
        if ( bExp == 0x7FFF ) {
5385
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5386
            goto invalid;
5387
        }
5388
        return packFloat128( zSign, 0x7FFF, 0, 0 );
5389
    }
5390
    if ( bExp == 0x7FFF ) {
5391
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5392
        return packFloat128( zSign, 0, 0, 0 );
5393
    }
5394
    if ( bExp == 0 ) {
5395
        if ( ( bSig0 | bSig1 ) == 0 ) {
5396
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5397
 invalid:
5398
                float_raise( float_flag_invalid STATUS_VAR);
5399
                z.low = float128_default_nan_low;
5400
                z.high = float128_default_nan_high;
5401
                return z;
5402
            }
5403
            float_raise( float_flag_divbyzero STATUS_VAR);
5404
            return packFloat128( zSign, 0x7FFF, 0, 0 );
5405
        }
5406
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5407
    }
5408
    if ( aExp == 0 ) {
5409
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5410
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5411
    }
5412
    zExp = aExp - bExp + 0x3FFD;
5413
    shortShift128Left(
5414
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5415
    shortShift128Left(
5416
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5417
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5418
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5419
        ++zExp;
5420
    }
5421
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5422
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5423
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5424
    while ( (sbits64) rem0 < 0 ) {
5425
        --zSig0;
5426
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5427
    }
5428
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5429
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5430
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5431
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5432
        while ( (sbits64) rem1 < 0 ) {
5433
            --zSig1;
5434
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5435
        }
5436
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5437
    }
5438
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5439
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5440

    
5441
}
5442

    
5443
/*----------------------------------------------------------------------------
5444
| Returns the remainder of the quadruple-precision floating-point value `a'
5445
| with respect to the corresponding value `b'.  The operation is performed
5446
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5447
*----------------------------------------------------------------------------*/
5448

    
5449
float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5450
{
5451
    flag aSign, zSign;
5452
    int32 aExp, bExp, expDiff;
5453
    bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5454
    bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5455
    sbits64 sigMean0;
5456
    float128 z;
5457

    
5458
    aSig1 = extractFloat128Frac1( a );
5459
    aSig0 = extractFloat128Frac0( a );
5460
    aExp = extractFloat128Exp( a );
5461
    aSign = extractFloat128Sign( a );
5462
    bSig1 = extractFloat128Frac1( b );
5463
    bSig0 = extractFloat128Frac0( b );
5464
    bExp = extractFloat128Exp( b );
5465
    if ( aExp == 0x7FFF ) {
5466
        if (    ( aSig0 | aSig1 )
5467
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5468
            return propagateFloat128NaN( a, b STATUS_VAR );
5469
        }
5470
        goto invalid;
5471
    }
5472
    if ( bExp == 0x7FFF ) {
5473
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5474
        return a;
5475
    }
5476
    if ( bExp == 0 ) {
5477
        if ( ( bSig0 | bSig1 ) == 0 ) {
5478
 invalid:
5479
            float_raise( float_flag_invalid STATUS_VAR);
5480
            z.low = float128_default_nan_low;
5481
            z.high = float128_default_nan_high;
5482
            return z;
5483
        }
5484
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5485
    }
5486
    if ( aExp == 0 ) {
5487
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
5488
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5489
    }
5490
    expDiff = aExp - bExp;
5491
    if ( expDiff < -1 ) return a;
5492
    shortShift128Left(
5493
        aSig0 | LIT64( 0x0001000000000000 ),
5494
        aSig1,
5495
        15 - ( expDiff < 0 ),
5496
        &aSig0,
5497
        &aSig1
5498
    );
5499
    shortShift128Left(
5500
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5501
    q = le128( bSig0, bSig1, aSig0, aSig1 );
5502
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5503
    expDiff -= 64;
5504
    while ( 0 < expDiff ) {
5505
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5506
        q = ( 4 < q ) ? q - 4 : 0;
5507
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5508
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5509
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5510
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5511
        expDiff -= 61;
5512
    }
5513
    if ( -64 < expDiff ) {
5514
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5515
        q = ( 4 < q ) ? q - 4 : 0;
5516
        q >>= - expDiff;
5517
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5518
        expDiff += 52;
5519
        if ( expDiff < 0 ) {
5520
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5521
        }
5522
        else {
5523
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5524
        }
5525
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5526
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5527
    }
5528
    else {
5529
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5530
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5531
    }
5532
    do {
5533
        alternateASig0 = aSig0;
5534
        alternateASig1 = aSig1;
5535
        ++q;
5536
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5537
    } while ( 0 <= (sbits64) aSig0 );
5538
    add128(
5539
        aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5540
    if (    ( sigMean0 < 0 )
5541
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5542
        aSig0 = alternateASig0;
5543
        aSig1 = alternateASig1;
5544
    }
5545
    zSign = ( (sbits64) aSig0 < 0 );
5546
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5547
    return
5548
        normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5549

    
5550
}
5551

    
5552
/*----------------------------------------------------------------------------
5553
| Returns the square root of the quadruple-precision floating-point value `a'.
5554
| The operation is performed according to the IEC/IEEE Standard for Binary
5555
| Floating-Point Arithmetic.
5556
*----------------------------------------------------------------------------*/
5557

    
5558
float128 float128_sqrt( float128 a STATUS_PARAM )
5559
{
5560
    flag aSign;
5561
    int32 aExp, zExp;
5562
    bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5563
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5564
    float128 z;
5565

    
5566
    aSig1 = extractFloat128Frac1( a );
5567
    aSig0 = extractFloat128Frac0( a );
5568
    aExp = extractFloat128Exp( a );
5569
    aSign = extractFloat128Sign( a );
5570
    if ( aExp == 0x7FFF ) {
5571
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5572
        if ( ! aSign ) return a;
5573
        goto invalid;
5574
    }
5575
    if ( aSign ) {
5576
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5577
 invalid:
5578
        float_raise( float_flag_invalid STATUS_VAR);
5579
        z.low = float128_default_nan_low;
5580
        z.high = float128_default_nan_high;
5581
        return z;
5582
    }
5583
    if ( aExp == 0 ) {
5584
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5585
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5586
    }
5587
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5588
    aSig0 |= LIT64( 0x0001000000000000 );
5589
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5590
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5591
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5592
    doubleZSig0 = zSig0<<1;
5593
    mul64To128( zSig0, zSig0, &term0, &term1 );
5594
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5595
    while ( (sbits64) rem0 < 0 ) {
5596
        --zSig0;
5597
        doubleZSig0 -= 2;
5598
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5599
    }
5600
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5601
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5602
        if ( zSig1 == 0 ) zSig1 = 1;
5603
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5604
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5605
        mul64To128( zSig1, zSig1, &term2, &term3 );
5606
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5607
        while ( (sbits64) rem1 < 0 ) {
5608
            --zSig1;
5609
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5610
            term3 |= 1;
5611
            term2 |= doubleZSig0;
5612
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5613
        }
5614
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5615
    }
5616
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5617
    return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5618

    
5619
}
5620

    
5621
/*----------------------------------------------------------------------------
5622
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5623
| the corresponding value `b', and 0 otherwise.  The comparison is performed
5624
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5625
*----------------------------------------------------------------------------*/
5626

    
5627
int float128_eq( float128 a, float128 b STATUS_PARAM )
5628
{
5629

    
5630
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5631
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5632
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5633
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5634
       ) {
5635
        if (    float128_is_signaling_nan( a )
5636
             || float128_is_signaling_nan( b ) ) {
5637
            float_raise( float_flag_invalid STATUS_VAR);
5638
        }
5639
        return 0;
5640
    }
5641
    return
5642
           ( a.low == b.low )
5643
        && (    ( a.high == b.high )
5644
             || (    ( a.low == 0 )
5645
                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5646
           );
5647

    
5648
}
5649

    
5650
/*----------------------------------------------------------------------------
5651
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5652
| or equal to the corresponding value `b', and 0 otherwise.  The comparison
5653
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5654
| Arithmetic.
5655
*----------------------------------------------------------------------------*/
5656

    
5657
int float128_le( float128 a, float128 b STATUS_PARAM )
5658
{
5659
    flag aSign, bSign;
5660

    
5661
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5662
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5663
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5664
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5665
       ) {
5666
        float_raise( float_flag_invalid STATUS_VAR);
5667
        return 0;
5668
    }
5669
    aSign = extractFloat128Sign( a );
5670
    bSign = extractFloat128Sign( b );
5671
    if ( aSign != bSign ) {
5672
        return
5673
               aSign
5674
            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5675
                 == 0 );
5676
    }
5677
    return
5678
          aSign ? le128( b.high, b.low, a.high, a.low )
5679
        : le128( a.high, a.low, b.high, b.low );
5680

    
5681
}
5682

    
5683
/*----------------------------------------------------------------------------
5684
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5685
| the corresponding value `b', and 0 otherwise.  The comparison is performed
5686
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5687
*----------------------------------------------------------------------------*/
5688

    
5689
int float128_lt( float128 a, float128 b STATUS_PARAM )
5690
{
5691
    flag aSign, bSign;
5692

    
5693
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5694
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5695
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5696
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5697
       ) {
5698
        float_raise( float_flag_invalid STATUS_VAR);
5699
        return 0;
5700
    }
5701
    aSign = extractFloat128Sign( a );
5702
    bSign = extractFloat128Sign( b );
5703
    if ( aSign != bSign ) {
5704
        return
5705
               aSign
5706
            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5707
                 != 0 );
5708
    }
5709
    return
5710
          aSign ? lt128( b.high, b.low, a.high, a.low )
5711
        : lt128( a.high, a.low, b.high, b.low );
5712

    
5713
}
5714

    
5715
/*----------------------------------------------------------------------------
5716
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5717
| the corresponding value `b', and 0 otherwise.  The invalid exception is
5718
| raised if either operand is a NaN.  Otherwise, the comparison is performed
5719
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5720
*----------------------------------------------------------------------------*/
5721

    
5722
int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5723
{
5724

    
5725
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5726
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5727
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5728
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5729
       ) {
5730
        float_raise( float_flag_invalid STATUS_VAR);
5731
        return 0;
5732
    }
5733
    return
5734
           ( a.low == b.low )
5735
        && (    ( a.high == b.high )
5736
             || (    ( a.low == 0 )
5737
                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5738
           );
5739

    
5740
}
5741

    
5742
/*----------------------------------------------------------------------------
5743
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5744
| or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5745
| cause an exception.  Otherwise, the comparison is performed according to the
5746
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5747
*----------------------------------------------------------------------------*/
5748

    
5749
int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5750
{
5751
    flag aSign, bSign;
5752

    
5753
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5754
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5755
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5756
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5757
       ) {
5758
        if (    float128_is_signaling_nan( a )
5759
             || float128_is_signaling_nan( b ) ) {
5760
            float_raise( float_flag_invalid STATUS_VAR);
5761
        }
5762
        return 0;
5763
    }
5764
    aSign = extractFloat128Sign( a );
5765
    bSign = extractFloat128Sign( b );
5766
    if ( aSign != bSign ) {
5767
        return
5768
               aSign
5769
            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5770
                 == 0 );
5771
    }
5772
    return
5773
          aSign ? le128( b.high, b.low, a.high, a.low )
5774
        : le128( a.high, a.low, b.high, b.low );
5775

    
5776
}
5777

    
5778
/*----------------------------------------------------------------------------
5779
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5780
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5781
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5782
| Standard for Binary Floating-Point Arithmetic.
5783
*----------------------------------------------------------------------------*/
5784

    
5785
int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5786
{
5787
    flag aSign, bSign;
5788

    
5789
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5790
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5791
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5792
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5793
       ) {
5794
        if (    float128_is_signaling_nan( a )
5795
             || float128_is_signaling_nan( b ) ) {
5796
            float_raise( float_flag_invalid STATUS_VAR);
5797
        }
5798
        return 0;
5799
    }
5800
    aSign = extractFloat128Sign( a );
5801
    bSign = extractFloat128Sign( b );
5802
    if ( aSign != bSign ) {
5803
        return
5804
               aSign
5805
            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5806
                 != 0 );
5807
    }
5808
    return
5809
          aSign ? lt128( b.high, b.low, a.high, a.low )
5810
        : lt128( a.high, a.low, b.high, b.low );
5811

    
5812
}
5813

    
5814
#endif
5815

    
5816
/* misc functions */
5817
float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5818
{
5819
    return int64_to_float32(a STATUS_VAR);
5820
}
5821

    
5822
float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5823
{
5824
    return int64_to_float64(a STATUS_VAR);
5825
}
5826

    
5827
unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5828
{
5829
    int64_t v;
5830
    unsigned int res;
5831

    
5832
    v = float32_to_int64(a STATUS_VAR);
5833
    if (v < 0) {
5834
        res = 0;
5835
        float_raise( float_flag_invalid STATUS_VAR);
5836
    } else if (v > 0xffffffff) {
5837
        res = 0xffffffff;
5838
        float_raise( float_flag_invalid STATUS_VAR);
5839
    } else {
5840
        res = v;
5841
    }
5842
    return res;
5843
}
5844

    
5845
unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5846
{
5847
    int64_t v;
5848
    unsigned int res;
5849

    
5850
    v = float32_to_int64_round_to_zero(a STATUS_VAR);
5851
    if (v < 0) {
5852
        res = 0;
5853
        float_raise( float_flag_invalid STATUS_VAR);
5854
    } else if (v > 0xffffffff) {
5855
        res = 0xffffffff;
5856
        float_raise( float_flag_invalid STATUS_VAR);
5857
    } else {
5858
        res = v;
5859
    }
5860
    return res;
5861
}
5862

    
5863
unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
5864
{
5865
    int64_t v;
5866
    unsigned int res;
5867

    
5868
    v = float32_to_int64_round_to_zero(a STATUS_VAR);
5869
    if (v < 0) {
5870
        res = 0;
5871
        float_raise( float_flag_invalid STATUS_VAR);
5872
    } else if (v > 0xffff) {
5873
        res = 0xffff;
5874
        float_raise( float_flag_invalid STATUS_VAR);
5875
    } else {
5876
        res = v;
5877
    }
5878
    return res;
5879
}
5880

    
5881
unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5882
{
5883
    int64_t v;
5884
    unsigned int res;
5885

    
5886
    v = float64_to_int64(a STATUS_VAR);
5887
    if (v < 0) {
5888
        res = 0;
5889
        float_raise( float_flag_invalid STATUS_VAR);
5890
    } else if (v > 0xffffffff) {
5891
        res = 0xffffffff;
5892
        float_raise( float_flag_invalid STATUS_VAR);
5893
    } else {
5894
        res = v;
5895
    }
5896
    return res;
5897
}
5898

    
5899
unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5900
{
5901
    int64_t v;
5902
    unsigned int res;
5903

    
5904
    v = float64_to_int64_round_to_zero(a STATUS_VAR);
5905
    if (v < 0) {
5906
        res = 0;
5907
        float_raise( float_flag_invalid STATUS_VAR);
5908
    } else if (v > 0xffffffff) {
5909
        res = 0xffffffff;
5910
        float_raise( float_flag_invalid STATUS_VAR);
5911
    } else {
5912
        res = v;
5913
    }
5914
    return res;
5915
}
5916

    
5917
unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
5918
{
5919
    int64_t v;
5920
    unsigned int res;
5921

    
5922
    v = float64_to_int64_round_to_zero(a STATUS_VAR);
5923
    if (v < 0) {
5924
        res = 0;
5925
        float_raise( float_flag_invalid STATUS_VAR);
5926
    } else if (v > 0xffff) {
5927
        res = 0xffff;
5928
        float_raise( float_flag_invalid STATUS_VAR);
5929
    } else {
5930
        res = v;
5931
    }
5932
    return res;
5933
}
5934

    
5935
/* FIXME: This looks broken.  */
5936
uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5937
{
5938
    int64_t v;
5939

    
5940
    v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5941
    v += float64_val(a);
5942
    v = float64_to_int64(make_float64(v) STATUS_VAR);
5943

    
5944
    return v - INT64_MIN;
5945
}
5946

    
5947
uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5948
{
5949
    int64_t v;
5950

    
5951
    v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5952
    v += float64_val(a);
5953
    v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5954

    
5955
    return v - INT64_MIN;
5956
}
5957

    
5958
#define COMPARE(s, nan_exp)                                                  \
5959
INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
5960
                                      int is_quiet STATUS_PARAM )            \
5961
{                                                                            \
5962
    flag aSign, bSign;                                                       \
5963
    bits ## s av, bv;                                                        \
5964
    a = float ## s ## _squash_input_denormal(a STATUS_VAR);                  \
5965
    b = float ## s ## _squash_input_denormal(b STATUS_VAR);                  \
5966
                                                                             \
5967
    if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
5968
         extractFloat ## s ## Frac( a ) ) ||                                 \
5969
        ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
5970
          extractFloat ## s ## Frac( b ) )) {                                \
5971
        if (!is_quiet ||                                                     \
5972
            float ## s ## _is_signaling_nan( a ) ||                          \
5973
            float ## s ## _is_signaling_nan( b ) ) {                         \
5974
            float_raise( float_flag_invalid STATUS_VAR);                     \
5975
        }                                                                    \
5976
        return float_relation_unordered;                                     \
5977
    }                                                                        \
5978
    aSign = extractFloat ## s ## Sign( a );                                  \
5979
    bSign = extractFloat ## s ## Sign( b );                                  \
5980
    av = float ## s ## _val(a);                                              \
5981
    bv = float ## s ## _val(b);                                              \
5982
    if ( aSign != bSign ) {                                                  \
5983
        if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) {                         \
5984
            /* zero case */                                                  \
5985
            return float_relation_equal;                                     \
5986
        } else {                                                             \
5987
            return 1 - (2 * aSign);                                          \
5988
        }                                                                    \
5989
    } else {                                                                 \
5990
        if (av == bv) {                                                      \
5991
            return float_relation_equal;                                     \
5992
        } else {                                                             \
5993
            return 1 - 2 * (aSign ^ ( av < bv ));                            \
5994
        }                                                                    \
5995
    }                                                                        \
5996
}                                                                            \
5997
                                                                             \
5998
int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
5999
{                                                                            \
6000
    return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
6001
}                                                                            \
6002
                                                                             \
6003
int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
6004
{                                                                            \
6005
    return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
6006
}
6007

    
6008
COMPARE(32, 0xff)
6009
COMPARE(64, 0x7ff)
6010

    
6011
INLINE int float128_compare_internal( float128 a, float128 b,
6012
                                      int is_quiet STATUS_PARAM )
6013
{
6014
    flag aSign, bSign;
6015

    
6016
    if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6017
          ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6018
        ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6019
          ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6020
        if (!is_quiet ||
6021
            float128_is_signaling_nan( a ) ||
6022
            float128_is_signaling_nan( b ) ) {
6023
            float_raise( float_flag_invalid STATUS_VAR);
6024
        }
6025
        return float_relation_unordered;
6026
    }
6027
    aSign = extractFloat128Sign( a );
6028
    bSign = extractFloat128Sign( b );
6029
    if ( aSign != bSign ) {
6030
        if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6031
            /* zero case */
6032
            return float_relation_equal;
6033
        } else {
6034
            return 1 - (2 * aSign);
6035
        }
6036
    } else {
6037
        if (a.low == b.low && a.high == b.high) {
6038
            return float_relation_equal;
6039
        } else {
6040
            return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6041
        }
6042
    }
6043
}
6044

    
6045
int float128_compare( float128 a, float128 b STATUS_PARAM )
6046
{
6047
    return float128_compare_internal(a, b, 0 STATUS_VAR);
6048
}
6049

    
6050
int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6051
{
6052
    return float128_compare_internal(a, b, 1 STATUS_VAR);
6053
}
6054

    
6055
/* Multiply A by 2 raised to the power N.  */
6056
float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6057
{
6058
    flag aSign;
6059
    int16 aExp;
6060
    bits32 aSig;
6061

    
6062
    a = float32_squash_input_denormal(a STATUS_VAR);
6063
    aSig = extractFloat32Frac( a );
6064
    aExp = extractFloat32Exp( a );
6065
    aSign = extractFloat32Sign( a );
6066

    
6067
    if ( aExp == 0xFF ) {
6068
        return a;
6069
    }
6070
    if ( aExp != 0 )
6071
        aSig |= 0x00800000;
6072
    else if ( aSig == 0 )
6073
        return a;
6074

    
6075
    aExp += n - 1;
6076
    aSig <<= 7;
6077
    return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6078
}
6079

    
6080
float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6081
{
6082
    flag aSign;
6083
    int16 aExp;
6084
    bits64 aSig;
6085

    
6086
    a = float64_squash_input_denormal(a STATUS_VAR);
6087
    aSig = extractFloat64Frac( a );
6088
    aExp = extractFloat64Exp( a );
6089
    aSign = extractFloat64Sign( a );
6090

    
6091
    if ( aExp == 0x7FF ) {
6092
        return a;
6093
    }
6094
    if ( aExp != 0 )
6095
        aSig |= LIT64( 0x0010000000000000 );
6096
    else if ( aSig == 0 )
6097
        return a;
6098

    
6099
    aExp += n - 1;
6100
    aSig <<= 10;
6101
    return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6102
}
6103

    
6104
#ifdef FLOATX80
6105
floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6106
{
6107
    flag aSign;
6108
    int16 aExp;
6109
    bits64 aSig;
6110

    
6111
    aSig = extractFloatx80Frac( a );
6112
    aExp = extractFloatx80Exp( a );
6113
    aSign = extractFloatx80Sign( a );
6114

    
6115
    if ( aExp == 0x7FF ) {
6116
        return a;
6117
    }
6118
    if (aExp == 0 && aSig == 0)
6119
        return a;
6120

    
6121
    aExp += n;
6122
    return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6123
                                          aSign, aExp, aSig, 0 STATUS_VAR );
6124
}
6125
#endif
6126

    
6127
#ifdef FLOAT128
6128
float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6129
{
6130
    flag aSign;
6131
    int32 aExp;
6132
    bits64 aSig0, aSig1;
6133

    
6134
    aSig1 = extractFloat128Frac1( a );
6135
    aSig0 = extractFloat128Frac0( a );
6136
    aExp = extractFloat128Exp( a );
6137
    aSign = extractFloat128Sign( a );
6138
    if ( aExp == 0x7FFF ) {
6139
        return a;
6140
    }
6141
    if ( aExp != 0 )
6142
        aSig0 |= LIT64( 0x0001000000000000 );
6143
    else if ( aSig0 == 0 && aSig1 == 0 )
6144
        return a;
6145

    
6146
    aExp += n - 1;
6147
    return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6148
                                          STATUS_VAR );
6149

    
6150
}
6151
#endif