Statistics
| Branch: | Revision:

root / fpu / softfloat.c @ e6afc87f

History | View | Annotate | Download (228.9 kB)

1
/*
2
 * QEMU float support
3
 *
4
 * Derived from SoftFloat.
5
 */
6

    
7
/*============================================================================
8

9
This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
10
Package, Release 2b.
11

12
Written by John R. Hauser.  This work was made possible in part by the
13
International Computer Science Institute, located at Suite 600, 1947 Center
14
Street, Berkeley, California 94704.  Funding was partially provided by the
15
National Science Foundation under grant MIP-9311980.  The original version
16
of this code was written as part of a project to build a fixed-point vector
17
processor in collaboration with the University of California at Berkeley,
18
overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
19
is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20
arithmetic/SoftFloat.html'.
21

22
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
23
been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24
RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25
AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26
COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27
EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28
INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29
OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
30

31
Derivative works are acceptable, even for commercial purposes, so long as
32
(1) the source code for the derivative work includes prominent notice that
33
the work is derivative, and (2) the source code includes prominent notice with
34
these four paragraphs for those parts of this code that are retained.
35

36
=============================================================================*/
37

    
38
#include "softfloat.h"
39

    
40
/*----------------------------------------------------------------------------
41
| Primitive arithmetic functions, including multi-word arithmetic, and
42
| division and square root approximations.  (Can be specialized to target if
43
| desired.)
44
*----------------------------------------------------------------------------*/
45
#include "softfloat-macros.h"
46

    
47
/*----------------------------------------------------------------------------
48
| Functions and definitions to determine:  (1) whether tininess for underflow
49
| is detected before or after rounding by default, (2) what (if anything)
50
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
51
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
52
| are propagated from function inputs to output.  These details are target-
53
| specific.
54
*----------------------------------------------------------------------------*/
55
#include "softfloat-specialize.h"
56

    
57
void set_float_rounding_mode(int val STATUS_PARAM)
58
{
59
    STATUS(float_rounding_mode) = val;
60
}
61

    
62
void set_float_exception_flags(int val STATUS_PARAM)
63
{
64
    STATUS(float_exception_flags) = val;
65
}
66

    
67
#ifdef FLOATX80
68
void set_floatx80_rounding_precision(int val STATUS_PARAM)
69
{
70
    STATUS(floatx80_rounding_precision) = val;
71
}
72
#endif
73

    
74
/*----------------------------------------------------------------------------
75
| Returns the fraction bits of the half-precision floating-point value `a'.
76
*----------------------------------------------------------------------------*/
77

    
78
INLINE uint32_t extractFloat16Frac(float16 a)
79
{
80
    return float16_val(a) & 0x3ff;
81
}
82

    
83
/*----------------------------------------------------------------------------
84
| Returns the exponent bits of the half-precision floating-point value `a'.
85
*----------------------------------------------------------------------------*/
86

    
87
INLINE int16 extractFloat16Exp(float16 a)
88
{
89
    return (float16_val(a) >> 10) & 0x1f;
90
}
91

    
92
/*----------------------------------------------------------------------------
93
| Returns the sign bit of the single-precision floating-point value `a'.
94
*----------------------------------------------------------------------------*/
95

    
96
INLINE flag extractFloat16Sign(float16 a)
97
{
98
    return float16_val(a)>>15;
99
}
100

    
101
/*----------------------------------------------------------------------------
102
| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
103
| and 7, and returns the properly rounded 32-bit integer corresponding to the
104
| input.  If `zSign' is 1, the input is negated before being converted to an
105
| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
106
| is simply rounded to an integer, with the inexact exception raised if the
107
| input cannot be represented exactly as an integer.  However, if the fixed-
108
| point input is too large, the invalid exception is raised and the largest
109
| positive or negative integer is returned.
110
*----------------------------------------------------------------------------*/
111

    
112
static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
113
{
114
    int8 roundingMode;
115
    flag roundNearestEven;
116
    int8 roundIncrement, roundBits;
117
    int32 z;
118

    
119
    roundingMode = STATUS(float_rounding_mode);
120
    roundNearestEven = ( roundingMode == float_round_nearest_even );
121
    roundIncrement = 0x40;
122
    if ( ! roundNearestEven ) {
123
        if ( roundingMode == float_round_to_zero ) {
124
            roundIncrement = 0;
125
        }
126
        else {
127
            roundIncrement = 0x7F;
128
            if ( zSign ) {
129
                if ( roundingMode == float_round_up ) roundIncrement = 0;
130
            }
131
            else {
132
                if ( roundingMode == float_round_down ) roundIncrement = 0;
133
            }
134
        }
135
    }
136
    roundBits = absZ & 0x7F;
137
    absZ = ( absZ + roundIncrement )>>7;
138
    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
139
    z = absZ;
140
    if ( zSign ) z = - z;
141
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
142
        float_raise( float_flag_invalid STATUS_VAR);
143
        return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
144
    }
145
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
146
    return z;
147

    
148
}
149

    
150
/*----------------------------------------------------------------------------
151
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
152
| `absZ1', with binary point between bits 63 and 64 (between the input words),
153
| and returns the properly rounded 64-bit integer corresponding to the input.
154
| If `zSign' is 1, the input is negated before being converted to an integer.
155
| Ordinarily, the fixed-point input is simply rounded to an integer, with
156
| the inexact exception raised if the input cannot be represented exactly as
157
| an integer.  However, if the fixed-point input is too large, the invalid
158
| exception is raised and the largest positive or negative integer is
159
| returned.
160
*----------------------------------------------------------------------------*/
161

    
162
static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
163
{
164
    int8 roundingMode;
165
    flag roundNearestEven, increment;
166
    int64 z;
167

    
168
    roundingMode = STATUS(float_rounding_mode);
169
    roundNearestEven = ( roundingMode == float_round_nearest_even );
170
    increment = ( (int64_t) absZ1 < 0 );
171
    if ( ! roundNearestEven ) {
172
        if ( roundingMode == float_round_to_zero ) {
173
            increment = 0;
174
        }
175
        else {
176
            if ( zSign ) {
177
                increment = ( roundingMode == float_round_down ) && absZ1;
178
            }
179
            else {
180
                increment = ( roundingMode == float_round_up ) && absZ1;
181
            }
182
        }
183
    }
184
    if ( increment ) {
185
        ++absZ0;
186
        if ( absZ0 == 0 ) goto overflow;
187
        absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
188
    }
189
    z = absZ0;
190
    if ( zSign ) z = - z;
191
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
192
 overflow:
193
        float_raise( float_flag_invalid STATUS_VAR);
194
        return
195
              zSign ? (int64_t) LIT64( 0x8000000000000000 )
196
            : LIT64( 0x7FFFFFFFFFFFFFFF );
197
    }
198
    if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
199
    return z;
200

    
201
}
202

    
203
/*----------------------------------------------------------------------------
204
| Returns the fraction bits of the single-precision floating-point value `a'.
205
*----------------------------------------------------------------------------*/
206

    
207
INLINE uint32_t extractFloat32Frac( float32 a )
208
{
209

    
210
    return float32_val(a) & 0x007FFFFF;
211

    
212
}
213

    
214
/*----------------------------------------------------------------------------
215
| Returns the exponent bits of the single-precision floating-point value `a'.
216
*----------------------------------------------------------------------------*/
217

    
218
INLINE int16 extractFloat32Exp( float32 a )
219
{
220

    
221
    return ( float32_val(a)>>23 ) & 0xFF;
222

    
223
}
224

    
225
/*----------------------------------------------------------------------------
226
| Returns the sign bit of the single-precision floating-point value `a'.
227
*----------------------------------------------------------------------------*/
228

    
229
INLINE flag extractFloat32Sign( float32 a )
230
{
231

    
232
    return float32_val(a)>>31;
233

    
234
}
235

    
236
/*----------------------------------------------------------------------------
237
| If `a' is denormal and we are in flush-to-zero mode then set the
238
| input-denormal exception and return zero. Otherwise just return the value.
239
*----------------------------------------------------------------------------*/
240
static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
241
{
242
    if (STATUS(flush_inputs_to_zero)) {
243
        if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
244
            float_raise(float_flag_input_denormal STATUS_VAR);
245
            return make_float32(float32_val(a) & 0x80000000);
246
        }
247
    }
248
    return a;
249
}
250

    
251
/*----------------------------------------------------------------------------
252
| Normalizes the subnormal single-precision floating-point value represented
253
| by the denormalized significand `aSig'.  The normalized exponent and
254
| significand are stored at the locations pointed to by `zExpPtr' and
255
| `zSigPtr', respectively.
256
*----------------------------------------------------------------------------*/
257

    
258
static void
259
 normalizeFloat32Subnormal( uint32_t aSig, int16 *zExpPtr, uint32_t *zSigPtr )
260
{
261
    int8 shiftCount;
262

    
263
    shiftCount = countLeadingZeros32( aSig ) - 8;
264
    *zSigPtr = aSig<<shiftCount;
265
    *zExpPtr = 1 - shiftCount;
266

    
267
}
268

    
269
/*----------------------------------------------------------------------------
270
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
271
| single-precision floating-point value, returning the result.  After being
272
| shifted into the proper positions, the three fields are simply added
273
| together to form the result.  This means that any integer portion of `zSig'
274
| will be added into the exponent.  Since a properly normalized significand
275
| will have an integer portion equal to 1, the `zExp' input should be 1 less
276
| than the desired result exponent whenever `zSig' is a complete, normalized
277
| significand.
278
*----------------------------------------------------------------------------*/
279

    
280
INLINE float32 packFloat32( flag zSign, int16 zExp, uint32_t zSig )
281
{
282

    
283
    return make_float32(
284
          ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
285

    
286
}
287

    
288
/*----------------------------------------------------------------------------
289
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
290
| and significand `zSig', and returns the proper single-precision floating-
291
| point value corresponding to the abstract input.  Ordinarily, the abstract
292
| value is simply rounded and packed into the single-precision format, with
293
| the inexact exception raised if the abstract input cannot be represented
294
| exactly.  However, if the abstract value is too large, the overflow and
295
| inexact exceptions are raised and an infinity or maximal finite value is
296
| returned.  If the abstract value is too small, the input value is rounded to
297
| a subnormal number, and the underflow and inexact exceptions are raised if
298
| the abstract input cannot be represented exactly as a subnormal single-
299
| precision floating-point number.
300
|     The input significand `zSig' has its binary point between bits 30
301
| and 29, which is 7 bits to the left of the usual location.  This shifted
302
| significand must be normalized or smaller.  If `zSig' is not normalized,
303
| `zExp' must be 0; in that case, the result returned is a subnormal number,
304
| and it must not require rounding.  In the usual case that `zSig' is
305
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
306
| The handling of underflow and overflow follows the IEC/IEEE Standard for
307
| Binary Floating-Point Arithmetic.
308
*----------------------------------------------------------------------------*/
309

    
310
static float32 roundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
311
{
312
    int8 roundingMode;
313
    flag roundNearestEven;
314
    int8 roundIncrement, roundBits;
315
    flag isTiny;
316

    
317
    roundingMode = STATUS(float_rounding_mode);
318
    roundNearestEven = ( roundingMode == float_round_nearest_even );
319
    roundIncrement = 0x40;
320
    if ( ! roundNearestEven ) {
321
        if ( roundingMode == float_round_to_zero ) {
322
            roundIncrement = 0;
323
        }
324
        else {
325
            roundIncrement = 0x7F;
326
            if ( zSign ) {
327
                if ( roundingMode == float_round_up ) roundIncrement = 0;
328
            }
329
            else {
330
                if ( roundingMode == float_round_down ) roundIncrement = 0;
331
            }
332
        }
333
    }
334
    roundBits = zSig & 0x7F;
335
    if ( 0xFD <= (uint16_t) zExp ) {
336
        if (    ( 0xFD < zExp )
337
             || (    ( zExp == 0xFD )
338
                  && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
339
           ) {
340
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
341
            return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
342
        }
343
        if ( zExp < 0 ) {
344
            if (STATUS(flush_to_zero)) {
345
                float_raise(float_flag_output_denormal STATUS_VAR);
346
                return packFloat32(zSign, 0, 0);
347
            }
348
            isTiny =
349
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
350
                || ( zExp < -1 )
351
                || ( zSig + roundIncrement < 0x80000000 );
352
            shift32RightJamming( zSig, - zExp, &zSig );
353
            zExp = 0;
354
            roundBits = zSig & 0x7F;
355
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
356
        }
357
    }
358
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
359
    zSig = ( zSig + roundIncrement )>>7;
360
    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
361
    if ( zSig == 0 ) zExp = 0;
362
    return packFloat32( zSign, zExp, zSig );
363

    
364
}
365

    
366
/*----------------------------------------------------------------------------
367
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
368
| and significand `zSig', and returns the proper single-precision floating-
369
| point value corresponding to the abstract input.  This routine is just like
370
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
371
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
372
| floating-point exponent.
373
*----------------------------------------------------------------------------*/
374

    
375
static float32
376
 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
377
{
378
    int8 shiftCount;
379

    
380
    shiftCount = countLeadingZeros32( zSig ) - 1;
381
    return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
382

    
383
}
384

    
385
/*----------------------------------------------------------------------------
386
| Returns the fraction bits of the double-precision floating-point value `a'.
387
*----------------------------------------------------------------------------*/
388

    
389
INLINE uint64_t extractFloat64Frac( float64 a )
390
{
391

    
392
    return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
393

    
394
}
395

    
396
/*----------------------------------------------------------------------------
397
| Returns the exponent bits of the double-precision floating-point value `a'.
398
*----------------------------------------------------------------------------*/
399

    
400
INLINE int16 extractFloat64Exp( float64 a )
401
{
402

    
403
    return ( float64_val(a)>>52 ) & 0x7FF;
404

    
405
}
406

    
407
/*----------------------------------------------------------------------------
408
| Returns the sign bit of the double-precision floating-point value `a'.
409
*----------------------------------------------------------------------------*/
410

    
411
INLINE flag extractFloat64Sign( float64 a )
412
{
413

    
414
    return float64_val(a)>>63;
415

    
416
}
417

    
418
/*----------------------------------------------------------------------------
419
| If `a' is denormal and we are in flush-to-zero mode then set the
420
| input-denormal exception and return zero. Otherwise just return the value.
421
*----------------------------------------------------------------------------*/
422
static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
423
{
424
    if (STATUS(flush_inputs_to_zero)) {
425
        if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
426
            float_raise(float_flag_input_denormal STATUS_VAR);
427
            return make_float64(float64_val(a) & (1ULL << 63));
428
        }
429
    }
430
    return a;
431
}
432

    
433
/*----------------------------------------------------------------------------
434
| Normalizes the subnormal double-precision floating-point value represented
435
| by the denormalized significand `aSig'.  The normalized exponent and
436
| significand are stored at the locations pointed to by `zExpPtr' and
437
| `zSigPtr', respectively.
438
*----------------------------------------------------------------------------*/
439

    
440
static void
441
 normalizeFloat64Subnormal( uint64_t aSig, int16 *zExpPtr, uint64_t *zSigPtr )
442
{
443
    int8 shiftCount;
444

    
445
    shiftCount = countLeadingZeros64( aSig ) - 11;
446
    *zSigPtr = aSig<<shiftCount;
447
    *zExpPtr = 1 - shiftCount;
448

    
449
}
450

    
451
/*----------------------------------------------------------------------------
452
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
453
| double-precision floating-point value, returning the result.  After being
454
| shifted into the proper positions, the three fields are simply added
455
| together to form the result.  This means that any integer portion of `zSig'
456
| will be added into the exponent.  Since a properly normalized significand
457
| will have an integer portion equal to 1, the `zExp' input should be 1 less
458
| than the desired result exponent whenever `zSig' is a complete, normalized
459
| significand.
460
*----------------------------------------------------------------------------*/
461

    
462
INLINE float64 packFloat64( flag zSign, int16 zExp, uint64_t zSig )
463
{
464

    
465
    return make_float64(
466
        ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
467

    
468
}
469

    
470
/*----------------------------------------------------------------------------
471
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
472
| and significand `zSig', and returns the proper double-precision floating-
473
| point value corresponding to the abstract input.  Ordinarily, the abstract
474
| value is simply rounded and packed into the double-precision format, with
475
| the inexact exception raised if the abstract input cannot be represented
476
| exactly.  However, if the abstract value is too large, the overflow and
477
| inexact exceptions are raised and an infinity or maximal finite value is
478
| returned.  If the abstract value is too small, the input value is rounded
479
| to a subnormal number, and the underflow and inexact exceptions are raised
480
| if the abstract input cannot be represented exactly as a subnormal double-
481
| precision floating-point number.
482
|     The input significand `zSig' has its binary point between bits 62
483
| and 61, which is 10 bits to the left of the usual location.  This shifted
484
| significand must be normalized or smaller.  If `zSig' is not normalized,
485
| `zExp' must be 0; in that case, the result returned is a subnormal number,
486
| and it must not require rounding.  In the usual case that `zSig' is
487
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
488
| The handling of underflow and overflow follows the IEC/IEEE Standard for
489
| Binary Floating-Point Arithmetic.
490
*----------------------------------------------------------------------------*/
491

    
492
static float64 roundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
493
{
494
    int8 roundingMode;
495
    flag roundNearestEven;
496
    int16 roundIncrement, roundBits;
497
    flag isTiny;
498

    
499
    roundingMode = STATUS(float_rounding_mode);
500
    roundNearestEven = ( roundingMode == float_round_nearest_even );
501
    roundIncrement = 0x200;
502
    if ( ! roundNearestEven ) {
503
        if ( roundingMode == float_round_to_zero ) {
504
            roundIncrement = 0;
505
        }
506
        else {
507
            roundIncrement = 0x3FF;
508
            if ( zSign ) {
509
                if ( roundingMode == float_round_up ) roundIncrement = 0;
510
            }
511
            else {
512
                if ( roundingMode == float_round_down ) roundIncrement = 0;
513
            }
514
        }
515
    }
516
    roundBits = zSig & 0x3FF;
517
    if ( 0x7FD <= (uint16_t) zExp ) {
518
        if (    ( 0x7FD < zExp )
519
             || (    ( zExp == 0x7FD )
520
                  && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
521
           ) {
522
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
523
            return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
524
        }
525
        if ( zExp < 0 ) {
526
            if (STATUS(flush_to_zero)) {
527
                float_raise(float_flag_output_denormal STATUS_VAR);
528
                return packFloat64(zSign, 0, 0);
529
            }
530
            isTiny =
531
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
532
                || ( zExp < -1 )
533
                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
534
            shift64RightJamming( zSig, - zExp, &zSig );
535
            zExp = 0;
536
            roundBits = zSig & 0x3FF;
537
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
538
        }
539
    }
540
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
541
    zSig = ( zSig + roundIncrement )>>10;
542
    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
543
    if ( zSig == 0 ) zExp = 0;
544
    return packFloat64( zSign, zExp, zSig );
545

    
546
}
547

    
548
/*----------------------------------------------------------------------------
549
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
550
| and significand `zSig', and returns the proper double-precision floating-
551
| point value corresponding to the abstract input.  This routine is just like
552
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
553
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
554
| floating-point exponent.
555
*----------------------------------------------------------------------------*/
556

    
557
static float64
558
 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
559
{
560
    int8 shiftCount;
561

    
562
    shiftCount = countLeadingZeros64( zSig ) - 1;
563
    return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
564

    
565
}
566

    
567
#ifdef FLOATX80
568

    
569
/*----------------------------------------------------------------------------
570
| Returns the fraction bits of the extended double-precision floating-point
571
| value `a'.
572
*----------------------------------------------------------------------------*/
573

    
574
INLINE uint64_t extractFloatx80Frac( floatx80 a )
575
{
576

    
577
    return a.low;
578

    
579
}
580

    
581
/*----------------------------------------------------------------------------
582
| Returns the exponent bits of the extended double-precision floating-point
583
| value `a'.
584
*----------------------------------------------------------------------------*/
585

    
586
INLINE int32 extractFloatx80Exp( floatx80 a )
587
{
588

    
589
    return a.high & 0x7FFF;
590

    
591
}
592

    
593
/*----------------------------------------------------------------------------
594
| Returns the sign bit of the extended double-precision floating-point value
595
| `a'.
596
*----------------------------------------------------------------------------*/
597

    
598
INLINE flag extractFloatx80Sign( floatx80 a )
599
{
600

    
601
    return a.high>>15;
602

    
603
}
604

    
605
/*----------------------------------------------------------------------------
606
| Normalizes the subnormal extended double-precision floating-point value
607
| represented by the denormalized significand `aSig'.  The normalized exponent
608
| and significand are stored at the locations pointed to by `zExpPtr' and
609
| `zSigPtr', respectively.
610
*----------------------------------------------------------------------------*/
611

    
612
static void
613
 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
614
{
615
    int8 shiftCount;
616

    
617
    shiftCount = countLeadingZeros64( aSig );
618
    *zSigPtr = aSig<<shiftCount;
619
    *zExpPtr = 1 - shiftCount;
620

    
621
}
622

    
623
/*----------------------------------------------------------------------------
624
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
625
| extended double-precision floating-point value, returning the result.
626
*----------------------------------------------------------------------------*/
627

    
628
INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
629
{
630
    floatx80 z;
631

    
632
    z.low = zSig;
633
    z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
634
    return z;
635

    
636
}
637

    
638
/*----------------------------------------------------------------------------
639
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
640
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
641
| and returns the proper extended double-precision floating-point value
642
| corresponding to the abstract input.  Ordinarily, the abstract value is
643
| rounded and packed into the extended double-precision format, with the
644
| inexact exception raised if the abstract input cannot be represented
645
| exactly.  However, if the abstract value is too large, the overflow and
646
| inexact exceptions are raised and an infinity or maximal finite value is
647
| returned.  If the abstract value is too small, the input value is rounded to
648
| a subnormal number, and the underflow and inexact exceptions are raised if
649
| the abstract input cannot be represented exactly as a subnormal extended
650
| double-precision floating-point number.
651
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
652
| number of bits as single or double precision, respectively.  Otherwise, the
653
| result is rounded to the full precision of the extended double-precision
654
| format.
655
|     The input significand must be normalized or smaller.  If the input
656
| significand is not normalized, `zExp' must be 0; in that case, the result
657
| returned is a subnormal number, and it must not require rounding.  The
658
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
659
| Floating-Point Arithmetic.
660
*----------------------------------------------------------------------------*/
661

    
662
static floatx80
663
 roundAndPackFloatx80(
664
     int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
665
 STATUS_PARAM)
666
{
667
    int8 roundingMode;
668
    flag roundNearestEven, increment, isTiny;
669
    int64 roundIncrement, roundMask, roundBits;
670

    
671
    roundingMode = STATUS(float_rounding_mode);
672
    roundNearestEven = ( roundingMode == float_round_nearest_even );
673
    if ( roundingPrecision == 80 ) goto precision80;
674
    if ( roundingPrecision == 64 ) {
675
        roundIncrement = LIT64( 0x0000000000000400 );
676
        roundMask = LIT64( 0x00000000000007FF );
677
    }
678
    else if ( roundingPrecision == 32 ) {
679
        roundIncrement = LIT64( 0x0000008000000000 );
680
        roundMask = LIT64( 0x000000FFFFFFFFFF );
681
    }
682
    else {
683
        goto precision80;
684
    }
685
    zSig0 |= ( zSig1 != 0 );
686
    if ( ! roundNearestEven ) {
687
        if ( roundingMode == float_round_to_zero ) {
688
            roundIncrement = 0;
689
        }
690
        else {
691
            roundIncrement = roundMask;
692
            if ( zSign ) {
693
                if ( roundingMode == float_round_up ) roundIncrement = 0;
694
            }
695
            else {
696
                if ( roundingMode == float_round_down ) roundIncrement = 0;
697
            }
698
        }
699
    }
700
    roundBits = zSig0 & roundMask;
701
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
702
        if (    ( 0x7FFE < zExp )
703
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
704
           ) {
705
            goto overflow;
706
        }
707
        if ( zExp <= 0 ) {
708
            if (STATUS(flush_to_zero)) {
709
                float_raise(float_flag_output_denormal STATUS_VAR);
710
                return packFloatx80(zSign, 0, 0);
711
            }
712
            isTiny =
713
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
714
                || ( zExp < 0 )
715
                || ( zSig0 <= zSig0 + roundIncrement );
716
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
717
            zExp = 0;
718
            roundBits = zSig0 & roundMask;
719
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
720
            if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
721
            zSig0 += roundIncrement;
722
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
723
            roundIncrement = roundMask + 1;
724
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
725
                roundMask |= roundIncrement;
726
            }
727
            zSig0 &= ~ roundMask;
728
            return packFloatx80( zSign, zExp, zSig0 );
729
        }
730
    }
731
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
732
    zSig0 += roundIncrement;
733
    if ( zSig0 < roundIncrement ) {
734
        ++zExp;
735
        zSig0 = LIT64( 0x8000000000000000 );
736
    }
737
    roundIncrement = roundMask + 1;
738
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
739
        roundMask |= roundIncrement;
740
    }
741
    zSig0 &= ~ roundMask;
742
    if ( zSig0 == 0 ) zExp = 0;
743
    return packFloatx80( zSign, zExp, zSig0 );
744
 precision80:
745
    increment = ( (int64_t) zSig1 < 0 );
746
    if ( ! roundNearestEven ) {
747
        if ( roundingMode == float_round_to_zero ) {
748
            increment = 0;
749
        }
750
        else {
751
            if ( zSign ) {
752
                increment = ( roundingMode == float_round_down ) && zSig1;
753
            }
754
            else {
755
                increment = ( roundingMode == float_round_up ) && zSig1;
756
            }
757
        }
758
    }
759
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
760
        if (    ( 0x7FFE < zExp )
761
             || (    ( zExp == 0x7FFE )
762
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
763
                  && increment
764
                )
765
           ) {
766
            roundMask = 0;
767
 overflow:
768
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
769
            if (    ( roundingMode == float_round_to_zero )
770
                 || ( zSign && ( roundingMode == float_round_up ) )
771
                 || ( ! zSign && ( roundingMode == float_round_down ) )
772
               ) {
773
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
774
            }
775
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
776
        }
777
        if ( zExp <= 0 ) {
778
            isTiny =
779
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
780
                || ( zExp < 0 )
781
                || ! increment
782
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
783
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
784
            zExp = 0;
785
            if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
786
            if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
787
            if ( roundNearestEven ) {
788
                increment = ( (int64_t) zSig1 < 0 );
789
            }
790
            else {
791
                if ( zSign ) {
792
                    increment = ( roundingMode == float_round_down ) && zSig1;
793
                }
794
                else {
795
                    increment = ( roundingMode == float_round_up ) && zSig1;
796
                }
797
            }
798
            if ( increment ) {
799
                ++zSig0;
800
                zSig0 &=
801
                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
802
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
803
            }
804
            return packFloatx80( zSign, zExp, zSig0 );
805
        }
806
    }
807
    if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
808
    if ( increment ) {
809
        ++zSig0;
810
        if ( zSig0 == 0 ) {
811
            ++zExp;
812
            zSig0 = LIT64( 0x8000000000000000 );
813
        }
814
        else {
815
            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
816
        }
817
    }
818
    else {
819
        if ( zSig0 == 0 ) zExp = 0;
820
    }
821
    return packFloatx80( zSign, zExp, zSig0 );
822

    
823
}
824

    
825
/*----------------------------------------------------------------------------
826
| Takes an abstract floating-point value having sign `zSign', exponent
827
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
828
| and returns the proper extended double-precision floating-point value
829
| corresponding to the abstract input.  This routine is just like
830
| `roundAndPackFloatx80' except that the input significand does not have to be
831
| normalized.
832
*----------------------------------------------------------------------------*/
833

    
834
static floatx80
835
 normalizeRoundAndPackFloatx80(
836
     int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
837
 STATUS_PARAM)
838
{
839
    int8 shiftCount;
840

    
841
    if ( zSig0 == 0 ) {
842
        zSig0 = zSig1;
843
        zSig1 = 0;
844
        zExp -= 64;
845
    }
846
    shiftCount = countLeadingZeros64( zSig0 );
847
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
848
    zExp -= shiftCount;
849
    return
850
        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
851

    
852
}
853

    
854
#endif
855

    
856
#ifdef FLOAT128
857

    
858
/*----------------------------------------------------------------------------
859
| Returns the least-significant 64 fraction bits of the quadruple-precision
860
| floating-point value `a'.
861
*----------------------------------------------------------------------------*/
862

    
863
INLINE uint64_t extractFloat128Frac1( float128 a )
864
{
865

    
866
    return a.low;
867

    
868
}
869

    
870
/*----------------------------------------------------------------------------
871
| Returns the most-significant 48 fraction bits of the quadruple-precision
872
| floating-point value `a'.
873
*----------------------------------------------------------------------------*/
874

    
875
INLINE uint64_t extractFloat128Frac0( float128 a )
876
{
877

    
878
    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
879

    
880
}
881

    
882
/*----------------------------------------------------------------------------
883
| Returns the exponent bits of the quadruple-precision floating-point value
884
| `a'.
885
*----------------------------------------------------------------------------*/
886

    
887
INLINE int32 extractFloat128Exp( float128 a )
888
{
889

    
890
    return ( a.high>>48 ) & 0x7FFF;
891

    
892
}
893

    
894
/*----------------------------------------------------------------------------
895
| Returns the sign bit of the quadruple-precision floating-point value `a'.
896
*----------------------------------------------------------------------------*/
897

    
898
INLINE flag extractFloat128Sign( float128 a )
899
{
900

    
901
    return a.high>>63;
902

    
903
}
904

    
905
/*----------------------------------------------------------------------------
906
| Normalizes the subnormal quadruple-precision floating-point value
907
| represented by the denormalized significand formed by the concatenation of
908
| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
909
| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
910
| significand are stored at the location pointed to by `zSig0Ptr', and the
911
| least significant 64 bits of the normalized significand are stored at the
912
| location pointed to by `zSig1Ptr'.
913
*----------------------------------------------------------------------------*/
914

    
915
static void
916
 normalizeFloat128Subnormal(
917
     uint64_t aSig0,
918
     uint64_t aSig1,
919
     int32 *zExpPtr,
920
     uint64_t *zSig0Ptr,
921
     uint64_t *zSig1Ptr
922
 )
923
{
924
    int8 shiftCount;
925

    
926
    if ( aSig0 == 0 ) {
927
        shiftCount = countLeadingZeros64( aSig1 ) - 15;
928
        if ( shiftCount < 0 ) {
929
            *zSig0Ptr = aSig1>>( - shiftCount );
930
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
931
        }
932
        else {
933
            *zSig0Ptr = aSig1<<shiftCount;
934
            *zSig1Ptr = 0;
935
        }
936
        *zExpPtr = - shiftCount - 63;
937
    }
938
    else {
939
        shiftCount = countLeadingZeros64( aSig0 ) - 15;
940
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
941
        *zExpPtr = 1 - shiftCount;
942
    }
943

    
944
}
945

    
946
/*----------------------------------------------------------------------------
947
| Packs the sign `zSign', the exponent `zExp', and the significand formed
948
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
949
| floating-point value, returning the result.  After being shifted into the
950
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
951
| added together to form the most significant 32 bits of the result.  This
952
| means that any integer portion of `zSig0' will be added into the exponent.
953
| Since a properly normalized significand will have an integer portion equal
954
| to 1, the `zExp' input should be 1 less than the desired result exponent
955
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
956
| significand.
957
*----------------------------------------------------------------------------*/
958

    
959
INLINE float128
960
 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
961
{
962
    float128 z;
963

    
964
    z.low = zSig1;
965
    z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
966
    return z;
967

    
968
}
969

    
970
/*----------------------------------------------------------------------------
971
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
972
| and extended significand formed by the concatenation of `zSig0', `zSig1',
973
| and `zSig2', and returns the proper quadruple-precision floating-point value
974
| corresponding to the abstract input.  Ordinarily, the abstract value is
975
| simply rounded and packed into the quadruple-precision format, with the
976
| inexact exception raised if the abstract input cannot be represented
977
| exactly.  However, if the abstract value is too large, the overflow and
978
| inexact exceptions are raised and an infinity or maximal finite value is
979
| returned.  If the abstract value is too small, the input value is rounded to
980
| a subnormal number, and the underflow and inexact exceptions are raised if
981
| the abstract input cannot be represented exactly as a subnormal quadruple-
982
| precision floating-point number.
983
|     The input significand must be normalized or smaller.  If the input
984
| significand is not normalized, `zExp' must be 0; in that case, the result
985
| returned is a subnormal number, and it must not require rounding.  In the
986
| usual case that the input significand is normalized, `zExp' must be 1 less
987
| than the ``true'' floating-point exponent.  The handling of underflow and
988
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
989
*----------------------------------------------------------------------------*/
990

    
991
static float128
992
 roundAndPackFloat128(
993
     flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
994
{
995
    int8 roundingMode;
996
    flag roundNearestEven, increment, isTiny;
997

    
998
    roundingMode = STATUS(float_rounding_mode);
999
    roundNearestEven = ( roundingMode == float_round_nearest_even );
1000
    increment = ( (int64_t) zSig2 < 0 );
1001
    if ( ! roundNearestEven ) {
1002
        if ( roundingMode == float_round_to_zero ) {
1003
            increment = 0;
1004
        }
1005
        else {
1006
            if ( zSign ) {
1007
                increment = ( roundingMode == float_round_down ) && zSig2;
1008
            }
1009
            else {
1010
                increment = ( roundingMode == float_round_up ) && zSig2;
1011
            }
1012
        }
1013
    }
1014
    if ( 0x7FFD <= (uint32_t) zExp ) {
1015
        if (    ( 0x7FFD < zExp )
1016
             || (    ( zExp == 0x7FFD )
1017
                  && eq128(
1018
                         LIT64( 0x0001FFFFFFFFFFFF ),
1019
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
1020
                         zSig0,
1021
                         zSig1
1022
                     )
1023
                  && increment
1024
                )
1025
           ) {
1026
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1027
            if (    ( roundingMode == float_round_to_zero )
1028
                 || ( zSign && ( roundingMode == float_round_up ) )
1029
                 || ( ! zSign && ( roundingMode == float_round_down ) )
1030
               ) {
1031
                return
1032
                    packFloat128(
1033
                        zSign,
1034
                        0x7FFE,
1035
                        LIT64( 0x0000FFFFFFFFFFFF ),
1036
                        LIT64( 0xFFFFFFFFFFFFFFFF )
1037
                    );
1038
            }
1039
            return packFloat128( zSign, 0x7FFF, 0, 0 );
1040
        }
1041
        if ( zExp < 0 ) {
1042
            if (STATUS(flush_to_zero)) {
1043
                float_raise(float_flag_output_denormal STATUS_VAR);
1044
                return packFloat128(zSign, 0, 0, 0);
1045
            }
1046
            isTiny =
1047
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1048
                || ( zExp < -1 )
1049
                || ! increment
1050
                || lt128(
1051
                       zSig0,
1052
                       zSig1,
1053
                       LIT64( 0x0001FFFFFFFFFFFF ),
1054
                       LIT64( 0xFFFFFFFFFFFFFFFF )
1055
                   );
1056
            shift128ExtraRightJamming(
1057
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1058
            zExp = 0;
1059
            if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1060
            if ( roundNearestEven ) {
1061
                increment = ( (int64_t) zSig2 < 0 );
1062
            }
1063
            else {
1064
                if ( zSign ) {
1065
                    increment = ( roundingMode == float_round_down ) && zSig2;
1066
                }
1067
                else {
1068
                    increment = ( roundingMode == float_round_up ) && zSig2;
1069
                }
1070
            }
1071
        }
1072
    }
1073
    if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1074
    if ( increment ) {
1075
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1076
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1077
    }
1078
    else {
1079
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1080
    }
1081
    return packFloat128( zSign, zExp, zSig0, zSig1 );
1082

    
1083
}
1084

    
1085
/*----------------------------------------------------------------------------
1086
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1087
| and significand formed by the concatenation of `zSig0' and `zSig1', and
1088
| returns the proper quadruple-precision floating-point value corresponding
1089
| to the abstract input.  This routine is just like `roundAndPackFloat128'
1090
| except that the input significand has fewer bits and does not have to be
1091
| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1092
| point exponent.
1093
*----------------------------------------------------------------------------*/
1094

    
1095
static float128
1096
 normalizeRoundAndPackFloat128(
1097
     flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1098
{
1099
    int8 shiftCount;
1100
    uint64_t zSig2;
1101

    
1102
    if ( zSig0 == 0 ) {
1103
        zSig0 = zSig1;
1104
        zSig1 = 0;
1105
        zExp -= 64;
1106
    }
1107
    shiftCount = countLeadingZeros64( zSig0 ) - 15;
1108
    if ( 0 <= shiftCount ) {
1109
        zSig2 = 0;
1110
        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1111
    }
1112
    else {
1113
        shift128ExtraRightJamming(
1114
            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1115
    }
1116
    zExp -= shiftCount;
1117
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1118

    
1119
}
1120

    
1121
#endif
1122

    
1123
/*----------------------------------------------------------------------------
1124
| Returns the result of converting the 32-bit two's complement integer `a'
1125
| to the single-precision floating-point format.  The conversion is performed
1126
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1127
*----------------------------------------------------------------------------*/
1128

    
1129
float32 int32_to_float32( int32 a STATUS_PARAM )
1130
{
1131
    flag zSign;
1132

    
1133
    if ( a == 0 ) return float32_zero;
1134
    if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1135
    zSign = ( a < 0 );
1136
    return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1137

    
1138
}
1139

    
1140
/*----------------------------------------------------------------------------
1141
| Returns the result of converting the 32-bit two's complement integer `a'
1142
| to the double-precision floating-point format.  The conversion is performed
1143
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1144
*----------------------------------------------------------------------------*/
1145

    
1146
float64 int32_to_float64( int32 a STATUS_PARAM )
1147
{
1148
    flag zSign;
1149
    uint32 absA;
1150
    int8 shiftCount;
1151
    uint64_t zSig;
1152

    
1153
    if ( a == 0 ) return float64_zero;
1154
    zSign = ( a < 0 );
1155
    absA = zSign ? - a : a;
1156
    shiftCount = countLeadingZeros32( absA ) + 21;
1157
    zSig = absA;
1158
    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1159

    
1160
}
1161

    
1162
#ifdef FLOATX80
1163

    
1164
/*----------------------------------------------------------------------------
1165
| Returns the result of converting the 32-bit two's complement integer `a'
1166
| to the extended double-precision floating-point format.  The conversion
1167
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1168
| Arithmetic.
1169
*----------------------------------------------------------------------------*/
1170

    
1171
floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1172
{
1173
    flag zSign;
1174
    uint32 absA;
1175
    int8 shiftCount;
1176
    uint64_t zSig;
1177

    
1178
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1179
    zSign = ( a < 0 );
1180
    absA = zSign ? - a : a;
1181
    shiftCount = countLeadingZeros32( absA ) + 32;
1182
    zSig = absA;
1183
    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1184

    
1185
}
1186

    
1187
#endif
1188

    
1189
#ifdef FLOAT128
1190

    
1191
/*----------------------------------------------------------------------------
1192
| Returns the result of converting the 32-bit two's complement integer `a' to
1193
| the quadruple-precision floating-point format.  The conversion is performed
1194
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1195
*----------------------------------------------------------------------------*/
1196

    
1197
float128 int32_to_float128( int32 a STATUS_PARAM )
1198
{
1199
    flag zSign;
1200
    uint32 absA;
1201
    int8 shiftCount;
1202
    uint64_t zSig0;
1203

    
1204
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1205
    zSign = ( a < 0 );
1206
    absA = zSign ? - a : a;
1207
    shiftCount = countLeadingZeros32( absA ) + 17;
1208
    zSig0 = absA;
1209
    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1210

    
1211
}
1212

    
1213
#endif
1214

    
1215
/*----------------------------------------------------------------------------
1216
| Returns the result of converting the 64-bit two's complement integer `a'
1217
| to the single-precision floating-point format.  The conversion is performed
1218
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1219
*----------------------------------------------------------------------------*/
1220

    
1221
float32 int64_to_float32( int64 a STATUS_PARAM )
1222
{
1223
    flag zSign;
1224
    uint64 absA;
1225
    int8 shiftCount;
1226

    
1227
    if ( a == 0 ) return float32_zero;
1228
    zSign = ( a < 0 );
1229
    absA = zSign ? - a : a;
1230
    shiftCount = countLeadingZeros64( absA ) - 40;
1231
    if ( 0 <= shiftCount ) {
1232
        return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1233
    }
1234
    else {
1235
        shiftCount += 7;
1236
        if ( shiftCount < 0 ) {
1237
            shift64RightJamming( absA, - shiftCount, &absA );
1238
        }
1239
        else {
1240
            absA <<= shiftCount;
1241
        }
1242
        return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1243
    }
1244

    
1245
}
1246

    
1247
float32 uint64_to_float32( uint64 a STATUS_PARAM )
1248
{
1249
    int8 shiftCount;
1250

    
1251
    if ( a == 0 ) return float32_zero;
1252
    shiftCount = countLeadingZeros64( a ) - 40;
1253
    if ( 0 <= shiftCount ) {
1254
        return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1255
    }
1256
    else {
1257
        shiftCount += 7;
1258
        if ( shiftCount < 0 ) {
1259
            shift64RightJamming( a, - shiftCount, &a );
1260
        }
1261
        else {
1262
            a <<= shiftCount;
1263
        }
1264
        return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1265
    }
1266
}
1267

    
1268
/*----------------------------------------------------------------------------
1269
| Returns the result of converting the 64-bit two's complement integer `a'
1270
| to the double-precision floating-point format.  The conversion is performed
1271
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1272
*----------------------------------------------------------------------------*/
1273

    
1274
float64 int64_to_float64( int64 a STATUS_PARAM )
1275
{
1276
    flag zSign;
1277

    
1278
    if ( a == 0 ) return float64_zero;
1279
    if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1280
        return packFloat64( 1, 0x43E, 0 );
1281
    }
1282
    zSign = ( a < 0 );
1283
    return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1284

    
1285
}
1286

    
1287
float64 uint64_to_float64( uint64 a STATUS_PARAM )
1288
{
1289
    if ( a == 0 ) return float64_zero;
1290
    return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1291

    
1292
}
1293

    
1294
#ifdef FLOATX80
1295

    
1296
/*----------------------------------------------------------------------------
1297
| Returns the result of converting the 64-bit two's complement integer `a'
1298
| to the extended double-precision floating-point format.  The conversion
1299
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1300
| Arithmetic.
1301
*----------------------------------------------------------------------------*/
1302

    
1303
floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1304
{
1305
    flag zSign;
1306
    uint64 absA;
1307
    int8 shiftCount;
1308

    
1309
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1310
    zSign = ( a < 0 );
1311
    absA = zSign ? - a : a;
1312
    shiftCount = countLeadingZeros64( absA );
1313
    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1314

    
1315
}
1316

    
1317
#endif
1318

    
1319
#ifdef FLOAT128
1320

    
1321
/*----------------------------------------------------------------------------
1322
| Returns the result of converting the 64-bit two's complement integer `a' to
1323
| the quadruple-precision floating-point format.  The conversion is performed
1324
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1325
*----------------------------------------------------------------------------*/
1326

    
1327
float128 int64_to_float128( int64 a STATUS_PARAM )
1328
{
1329
    flag zSign;
1330
    uint64 absA;
1331
    int8 shiftCount;
1332
    int32 zExp;
1333
    uint64_t zSig0, zSig1;
1334

    
1335
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1336
    zSign = ( a < 0 );
1337
    absA = zSign ? - a : a;
1338
    shiftCount = countLeadingZeros64( absA ) + 49;
1339
    zExp = 0x406E - shiftCount;
1340
    if ( 64 <= shiftCount ) {
1341
        zSig1 = 0;
1342
        zSig0 = absA;
1343
        shiftCount -= 64;
1344
    }
1345
    else {
1346
        zSig1 = absA;
1347
        zSig0 = 0;
1348
    }
1349
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1350
    return packFloat128( zSign, zExp, zSig0, zSig1 );
1351

    
1352
}
1353

    
1354
#endif
1355

    
1356
/*----------------------------------------------------------------------------
1357
| Returns the result of converting the single-precision floating-point value
1358
| `a' to the 32-bit two's complement integer format.  The conversion is
1359
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1360
| Arithmetic---which means in particular that the conversion is rounded
1361
| according to the current rounding mode.  If `a' is a NaN, the largest
1362
| positive integer is returned.  Otherwise, if the conversion overflows, the
1363
| largest integer with the same sign as `a' is returned.
1364
*----------------------------------------------------------------------------*/
1365

    
1366
int32 float32_to_int32( float32 a STATUS_PARAM )
1367
{
1368
    flag aSign;
1369
    int16 aExp, shiftCount;
1370
    uint32_t aSig;
1371
    uint64_t aSig64;
1372

    
1373
    a = float32_squash_input_denormal(a STATUS_VAR);
1374
    aSig = extractFloat32Frac( a );
1375
    aExp = extractFloat32Exp( a );
1376
    aSign = extractFloat32Sign( a );
1377
    if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1378
    if ( aExp ) aSig |= 0x00800000;
1379
    shiftCount = 0xAF - aExp;
1380
    aSig64 = aSig;
1381
    aSig64 <<= 32;
1382
    if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1383
    return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1384

    
1385
}
1386

    
1387
/*----------------------------------------------------------------------------
1388
| Returns the result of converting the single-precision floating-point value
1389
| `a' to the 32-bit two's complement integer format.  The conversion is
1390
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1391
| Arithmetic, except that the conversion is always rounded toward zero.
1392
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1393
| the conversion overflows, the largest integer with the same sign as `a' is
1394
| returned.
1395
*----------------------------------------------------------------------------*/
1396

    
1397
int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1398
{
1399
    flag aSign;
1400
    int16 aExp, shiftCount;
1401
    uint32_t aSig;
1402
    int32 z;
1403
    a = float32_squash_input_denormal(a STATUS_VAR);
1404

    
1405
    aSig = extractFloat32Frac( a );
1406
    aExp = extractFloat32Exp( a );
1407
    aSign = extractFloat32Sign( a );
1408
    shiftCount = aExp - 0x9E;
1409
    if ( 0 <= shiftCount ) {
1410
        if ( float32_val(a) != 0xCF000000 ) {
1411
            float_raise( float_flag_invalid STATUS_VAR);
1412
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1413
        }
1414
        return (int32_t) 0x80000000;
1415
    }
1416
    else if ( aExp <= 0x7E ) {
1417
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1418
        return 0;
1419
    }
1420
    aSig = ( aSig | 0x00800000 )<<8;
1421
    z = aSig>>( - shiftCount );
1422
    if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1423
        STATUS(float_exception_flags) |= float_flag_inexact;
1424
    }
1425
    if ( aSign ) z = - z;
1426
    return z;
1427

    
1428
}
1429

    
1430
/*----------------------------------------------------------------------------
1431
| Returns the result of converting the single-precision floating-point value
1432
| `a' to the 16-bit two's complement integer format.  The conversion is
1433
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1434
| Arithmetic, except that the conversion is always rounded toward zero.
1435
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1436
| the conversion overflows, the largest integer with the same sign as `a' is
1437
| returned.
1438
*----------------------------------------------------------------------------*/
1439

    
1440
int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1441
{
1442
    flag aSign;
1443
    int16 aExp, shiftCount;
1444
    uint32_t aSig;
1445
    int32 z;
1446

    
1447
    aSig = extractFloat32Frac( a );
1448
    aExp = extractFloat32Exp( a );
1449
    aSign = extractFloat32Sign( a );
1450
    shiftCount = aExp - 0x8E;
1451
    if ( 0 <= shiftCount ) {
1452
        if ( float32_val(a) != 0xC7000000 ) {
1453
            float_raise( float_flag_invalid STATUS_VAR);
1454
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1455
                return 0x7FFF;
1456
            }
1457
        }
1458
        return (int32_t) 0xffff8000;
1459
    }
1460
    else if ( aExp <= 0x7E ) {
1461
        if ( aExp | aSig ) {
1462
            STATUS(float_exception_flags) |= float_flag_inexact;
1463
        }
1464
        return 0;
1465
    }
1466
    shiftCount -= 0x10;
1467
    aSig = ( aSig | 0x00800000 )<<8;
1468
    z = aSig>>( - shiftCount );
1469
    if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1470
        STATUS(float_exception_flags) |= float_flag_inexact;
1471
    }
1472
    if ( aSign ) {
1473
        z = - z;
1474
    }
1475
    return z;
1476

    
1477
}
1478

    
1479
/*----------------------------------------------------------------------------
1480
| Returns the result of converting the single-precision floating-point value
1481
| `a' to the 64-bit two's complement integer format.  The conversion is
1482
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1483
| Arithmetic---which means in particular that the conversion is rounded
1484
| according to the current rounding mode.  If `a' is a NaN, the largest
1485
| positive integer is returned.  Otherwise, if the conversion overflows, the
1486
| largest integer with the same sign as `a' is returned.
1487
*----------------------------------------------------------------------------*/
1488

    
1489
int64 float32_to_int64( float32 a STATUS_PARAM )
1490
{
1491
    flag aSign;
1492
    int16 aExp, shiftCount;
1493
    uint32_t aSig;
1494
    uint64_t aSig64, aSigExtra;
1495
    a = float32_squash_input_denormal(a STATUS_VAR);
1496

    
1497
    aSig = extractFloat32Frac( a );
1498
    aExp = extractFloat32Exp( a );
1499
    aSign = extractFloat32Sign( a );
1500
    shiftCount = 0xBE - aExp;
1501
    if ( shiftCount < 0 ) {
1502
        float_raise( float_flag_invalid STATUS_VAR);
1503
        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1504
            return LIT64( 0x7FFFFFFFFFFFFFFF );
1505
        }
1506
        return (int64_t) LIT64( 0x8000000000000000 );
1507
    }
1508
    if ( aExp ) aSig |= 0x00800000;
1509
    aSig64 = aSig;
1510
    aSig64 <<= 40;
1511
    shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1512
    return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1513

    
1514
}
1515

    
1516
/*----------------------------------------------------------------------------
1517
| Returns the result of converting the single-precision floating-point value
1518
| `a' to the 64-bit two's complement integer format.  The conversion is
1519
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1520
| Arithmetic, except that the conversion is always rounded toward zero.  If
1521
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1522
| conversion overflows, the largest integer with the same sign as `a' is
1523
| returned.
1524
*----------------------------------------------------------------------------*/
1525

    
1526
int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1527
{
1528
    flag aSign;
1529
    int16 aExp, shiftCount;
1530
    uint32_t aSig;
1531
    uint64_t aSig64;
1532
    int64 z;
1533
    a = float32_squash_input_denormal(a STATUS_VAR);
1534

    
1535
    aSig = extractFloat32Frac( a );
1536
    aExp = extractFloat32Exp( a );
1537
    aSign = extractFloat32Sign( a );
1538
    shiftCount = aExp - 0xBE;
1539
    if ( 0 <= shiftCount ) {
1540
        if ( float32_val(a) != 0xDF000000 ) {
1541
            float_raise( float_flag_invalid STATUS_VAR);
1542
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1543
                return LIT64( 0x7FFFFFFFFFFFFFFF );
1544
            }
1545
        }
1546
        return (int64_t) LIT64( 0x8000000000000000 );
1547
    }
1548
    else if ( aExp <= 0x7E ) {
1549
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1550
        return 0;
1551
    }
1552
    aSig64 = aSig | 0x00800000;
1553
    aSig64 <<= 40;
1554
    z = aSig64>>( - shiftCount );
1555
    if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1556
        STATUS(float_exception_flags) |= float_flag_inexact;
1557
    }
1558
    if ( aSign ) z = - z;
1559
    return z;
1560

    
1561
}
1562

    
1563
/*----------------------------------------------------------------------------
1564
| Returns the result of converting the single-precision floating-point value
1565
| `a' to the double-precision floating-point format.  The conversion is
1566
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1567
| Arithmetic.
1568
*----------------------------------------------------------------------------*/
1569

    
1570
float64 float32_to_float64( float32 a STATUS_PARAM )
1571
{
1572
    flag aSign;
1573
    int16 aExp;
1574
    uint32_t aSig;
1575
    a = float32_squash_input_denormal(a STATUS_VAR);
1576

    
1577
    aSig = extractFloat32Frac( a );
1578
    aExp = extractFloat32Exp( a );
1579
    aSign = extractFloat32Sign( a );
1580
    if ( aExp == 0xFF ) {
1581
        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1582
        return packFloat64( aSign, 0x7FF, 0 );
1583
    }
1584
    if ( aExp == 0 ) {
1585
        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1586
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1587
        --aExp;
1588
    }
1589
    return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1590

    
1591
}
1592

    
1593
#ifdef FLOATX80
1594

    
1595
/*----------------------------------------------------------------------------
1596
| Returns the result of converting the single-precision floating-point value
1597
| `a' to the extended double-precision floating-point format.  The conversion
1598
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1599
| Arithmetic.
1600
*----------------------------------------------------------------------------*/
1601

    
1602
floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1603
{
1604
    flag aSign;
1605
    int16 aExp;
1606
    uint32_t aSig;
1607

    
1608
    a = float32_squash_input_denormal(a STATUS_VAR);
1609
    aSig = extractFloat32Frac( a );
1610
    aExp = extractFloat32Exp( a );
1611
    aSign = extractFloat32Sign( a );
1612
    if ( aExp == 0xFF ) {
1613
        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1614
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1615
    }
1616
    if ( aExp == 0 ) {
1617
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1618
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1619
    }
1620
    aSig |= 0x00800000;
1621
    return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1622

    
1623
}
1624

    
1625
#endif
1626

    
1627
#ifdef FLOAT128
1628

    
1629
/*----------------------------------------------------------------------------
1630
| Returns the result of converting the single-precision floating-point value
1631
| `a' to the double-precision floating-point format.  The conversion is
1632
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1633
| Arithmetic.
1634
*----------------------------------------------------------------------------*/
1635

    
1636
float128 float32_to_float128( float32 a STATUS_PARAM )
1637
{
1638
    flag aSign;
1639
    int16 aExp;
1640
    uint32_t aSig;
1641

    
1642
    a = float32_squash_input_denormal(a STATUS_VAR);
1643
    aSig = extractFloat32Frac( a );
1644
    aExp = extractFloat32Exp( a );
1645
    aSign = extractFloat32Sign( a );
1646
    if ( aExp == 0xFF ) {
1647
        if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1648
        return packFloat128( aSign, 0x7FFF, 0, 0 );
1649
    }
1650
    if ( aExp == 0 ) {
1651
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1652
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1653
        --aExp;
1654
    }
1655
    return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1656

    
1657
}
1658

    
1659
#endif
1660

    
1661
/*----------------------------------------------------------------------------
1662
| Rounds the single-precision floating-point value `a' to an integer, and
1663
| returns the result as a single-precision floating-point value.  The
1664
| operation is performed according to the IEC/IEEE Standard for Binary
1665
| Floating-Point Arithmetic.
1666
*----------------------------------------------------------------------------*/
1667

    
1668
float32 float32_round_to_int( float32 a STATUS_PARAM)
1669
{
1670
    flag aSign;
1671
    int16 aExp;
1672
    uint32_t lastBitMask, roundBitsMask;
1673
    int8 roundingMode;
1674
    uint32_t z;
1675
    a = float32_squash_input_denormal(a STATUS_VAR);
1676

    
1677
    aExp = extractFloat32Exp( a );
1678
    if ( 0x96 <= aExp ) {
1679
        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1680
            return propagateFloat32NaN( a, a STATUS_VAR );
1681
        }
1682
        return a;
1683
    }
1684
    if ( aExp <= 0x7E ) {
1685
        if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1686
        STATUS(float_exception_flags) |= float_flag_inexact;
1687
        aSign = extractFloat32Sign( a );
1688
        switch ( STATUS(float_rounding_mode) ) {
1689
         case float_round_nearest_even:
1690
            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1691
                return packFloat32( aSign, 0x7F, 0 );
1692
            }
1693
            break;
1694
         case float_round_down:
1695
            return make_float32(aSign ? 0xBF800000 : 0);
1696
         case float_round_up:
1697
            return make_float32(aSign ? 0x80000000 : 0x3F800000);
1698
        }
1699
        return packFloat32( aSign, 0, 0 );
1700
    }
1701
    lastBitMask = 1;
1702
    lastBitMask <<= 0x96 - aExp;
1703
    roundBitsMask = lastBitMask - 1;
1704
    z = float32_val(a);
1705
    roundingMode = STATUS(float_rounding_mode);
1706
    if ( roundingMode == float_round_nearest_even ) {
1707
        z += lastBitMask>>1;
1708
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1709
    }
1710
    else if ( roundingMode != float_round_to_zero ) {
1711
        if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1712
            z += roundBitsMask;
1713
        }
1714
    }
1715
    z &= ~ roundBitsMask;
1716
    if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1717
    return make_float32(z);
1718

    
1719
}
1720

    
1721
/*----------------------------------------------------------------------------
1722
| Returns the result of adding the absolute values of the single-precision
1723
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1724
| before being returned.  `zSign' is ignored if the result is a NaN.
1725
| The addition is performed according to the IEC/IEEE Standard for Binary
1726
| Floating-Point Arithmetic.
1727
*----------------------------------------------------------------------------*/
1728

    
1729
static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1730
{
1731
    int16 aExp, bExp, zExp;
1732
    uint32_t aSig, bSig, zSig;
1733
    int16 expDiff;
1734

    
1735
    aSig = extractFloat32Frac( a );
1736
    aExp = extractFloat32Exp( a );
1737
    bSig = extractFloat32Frac( b );
1738
    bExp = extractFloat32Exp( b );
1739
    expDiff = aExp - bExp;
1740
    aSig <<= 6;
1741
    bSig <<= 6;
1742
    if ( 0 < expDiff ) {
1743
        if ( aExp == 0xFF ) {
1744
            if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1745
            return a;
1746
        }
1747
        if ( bExp == 0 ) {
1748
            --expDiff;
1749
        }
1750
        else {
1751
            bSig |= 0x20000000;
1752
        }
1753
        shift32RightJamming( bSig, expDiff, &bSig );
1754
        zExp = aExp;
1755
    }
1756
    else if ( expDiff < 0 ) {
1757
        if ( bExp == 0xFF ) {
1758
            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1759
            return packFloat32( zSign, 0xFF, 0 );
1760
        }
1761
        if ( aExp == 0 ) {
1762
            ++expDiff;
1763
        }
1764
        else {
1765
            aSig |= 0x20000000;
1766
        }
1767
        shift32RightJamming( aSig, - expDiff, &aSig );
1768
        zExp = bExp;
1769
    }
1770
    else {
1771
        if ( aExp == 0xFF ) {
1772
            if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1773
            return a;
1774
        }
1775
        if ( aExp == 0 ) {
1776
            if (STATUS(flush_to_zero)) {
1777
                if (aSig | bSig) {
1778
                    float_raise(float_flag_output_denormal STATUS_VAR);
1779
                }
1780
                return packFloat32(zSign, 0, 0);
1781
            }
1782
            return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1783
        }
1784
        zSig = 0x40000000 + aSig + bSig;
1785
        zExp = aExp;
1786
        goto roundAndPack;
1787
    }
1788
    aSig |= 0x20000000;
1789
    zSig = ( aSig + bSig )<<1;
1790
    --zExp;
1791
    if ( (int32_t) zSig < 0 ) {
1792
        zSig = aSig + bSig;
1793
        ++zExp;
1794
    }
1795
 roundAndPack:
1796
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1797

    
1798
}
1799

    
1800
/*----------------------------------------------------------------------------
1801
| Returns the result of subtracting the absolute values of the single-
1802
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
1803
| difference is negated before being returned.  `zSign' is ignored if the
1804
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
1805
| Standard for Binary Floating-Point Arithmetic.
1806
*----------------------------------------------------------------------------*/
1807

    
1808
static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1809
{
1810
    int16 aExp, bExp, zExp;
1811
    uint32_t aSig, bSig, zSig;
1812
    int16 expDiff;
1813

    
1814
    aSig = extractFloat32Frac( a );
1815
    aExp = extractFloat32Exp( a );
1816
    bSig = extractFloat32Frac( b );
1817
    bExp = extractFloat32Exp( b );
1818
    expDiff = aExp - bExp;
1819
    aSig <<= 7;
1820
    bSig <<= 7;
1821
    if ( 0 < expDiff ) goto aExpBigger;
1822
    if ( expDiff < 0 ) goto bExpBigger;
1823
    if ( aExp == 0xFF ) {
1824
        if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1825
        float_raise( float_flag_invalid STATUS_VAR);
1826
        return float32_default_nan;
1827
    }
1828
    if ( aExp == 0 ) {
1829
        aExp = 1;
1830
        bExp = 1;
1831
    }
1832
    if ( bSig < aSig ) goto aBigger;
1833
    if ( aSig < bSig ) goto bBigger;
1834
    return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1835
 bExpBigger:
1836
    if ( bExp == 0xFF ) {
1837
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1838
        return packFloat32( zSign ^ 1, 0xFF, 0 );
1839
    }
1840
    if ( aExp == 0 ) {
1841
        ++expDiff;
1842
    }
1843
    else {
1844
        aSig |= 0x40000000;
1845
    }
1846
    shift32RightJamming( aSig, - expDiff, &aSig );
1847
    bSig |= 0x40000000;
1848
 bBigger:
1849
    zSig = bSig - aSig;
1850
    zExp = bExp;
1851
    zSign ^= 1;
1852
    goto normalizeRoundAndPack;
1853
 aExpBigger:
1854
    if ( aExp == 0xFF ) {
1855
        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1856
        return a;
1857
    }
1858
    if ( bExp == 0 ) {
1859
        --expDiff;
1860
    }
1861
    else {
1862
        bSig |= 0x40000000;
1863
    }
1864
    shift32RightJamming( bSig, expDiff, &bSig );
1865
    aSig |= 0x40000000;
1866
 aBigger:
1867
    zSig = aSig - bSig;
1868
    zExp = aExp;
1869
 normalizeRoundAndPack:
1870
    --zExp;
1871
    return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1872

    
1873
}
1874

    
1875
/*----------------------------------------------------------------------------
1876
| Returns the result of adding the single-precision floating-point values `a'
1877
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
1878
| Binary Floating-Point Arithmetic.
1879
*----------------------------------------------------------------------------*/
1880

    
1881
float32 float32_add( float32 a, float32 b STATUS_PARAM )
1882
{
1883
    flag aSign, bSign;
1884
    a = float32_squash_input_denormal(a STATUS_VAR);
1885
    b = float32_squash_input_denormal(b STATUS_VAR);
1886

    
1887
    aSign = extractFloat32Sign( a );
1888
    bSign = extractFloat32Sign( b );
1889
    if ( aSign == bSign ) {
1890
        return addFloat32Sigs( a, b, aSign STATUS_VAR);
1891
    }
1892
    else {
1893
        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1894
    }
1895

    
1896
}
1897

    
1898
/*----------------------------------------------------------------------------
1899
| Returns the result of subtracting the single-precision floating-point values
1900
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1901
| for Binary Floating-Point Arithmetic.
1902
*----------------------------------------------------------------------------*/
1903

    
1904
float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1905
{
1906
    flag aSign, bSign;
1907
    a = float32_squash_input_denormal(a STATUS_VAR);
1908
    b = float32_squash_input_denormal(b STATUS_VAR);
1909

    
1910
    aSign = extractFloat32Sign( a );
1911
    bSign = extractFloat32Sign( b );
1912
    if ( aSign == bSign ) {
1913
        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1914
    }
1915
    else {
1916
        return addFloat32Sigs( a, b, aSign STATUS_VAR );
1917
    }
1918

    
1919
}
1920

    
1921
/*----------------------------------------------------------------------------
1922
| Returns the result of multiplying the single-precision floating-point values
1923
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1924
| for Binary Floating-Point Arithmetic.
1925
*----------------------------------------------------------------------------*/
1926

    
1927
float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1928
{
1929
    flag aSign, bSign, zSign;
1930
    int16 aExp, bExp, zExp;
1931
    uint32_t aSig, bSig;
1932
    uint64_t zSig64;
1933
    uint32_t zSig;
1934

    
1935
    a = float32_squash_input_denormal(a STATUS_VAR);
1936
    b = float32_squash_input_denormal(b STATUS_VAR);
1937

    
1938
    aSig = extractFloat32Frac( a );
1939
    aExp = extractFloat32Exp( a );
1940
    aSign = extractFloat32Sign( a );
1941
    bSig = extractFloat32Frac( b );
1942
    bExp = extractFloat32Exp( b );
1943
    bSign = extractFloat32Sign( b );
1944
    zSign = aSign ^ bSign;
1945
    if ( aExp == 0xFF ) {
1946
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1947
            return propagateFloat32NaN( a, b STATUS_VAR );
1948
        }
1949
        if ( ( bExp | bSig ) == 0 ) {
1950
            float_raise( float_flag_invalid STATUS_VAR);
1951
            return float32_default_nan;
1952
        }
1953
        return packFloat32( zSign, 0xFF, 0 );
1954
    }
1955
    if ( bExp == 0xFF ) {
1956
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1957
        if ( ( aExp | aSig ) == 0 ) {
1958
            float_raise( float_flag_invalid STATUS_VAR);
1959
            return float32_default_nan;
1960
        }
1961
        return packFloat32( zSign, 0xFF, 0 );
1962
    }
1963
    if ( aExp == 0 ) {
1964
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1965
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1966
    }
1967
    if ( bExp == 0 ) {
1968
        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1969
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1970
    }
1971
    zExp = aExp + bExp - 0x7F;
1972
    aSig = ( aSig | 0x00800000 )<<7;
1973
    bSig = ( bSig | 0x00800000 )<<8;
1974
    shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
1975
    zSig = zSig64;
1976
    if ( 0 <= (int32_t) ( zSig<<1 ) ) {
1977
        zSig <<= 1;
1978
        --zExp;
1979
    }
1980
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1981

    
1982
}
1983

    
1984
/*----------------------------------------------------------------------------
1985
| Returns the result of dividing the single-precision floating-point value `a'
1986
| by the corresponding value `b'.  The operation is performed according to the
1987
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1988
*----------------------------------------------------------------------------*/
1989

    
1990
float32 float32_div( float32 a, float32 b STATUS_PARAM )
1991
{
1992
    flag aSign, bSign, zSign;
1993
    int16 aExp, bExp, zExp;
1994
    uint32_t aSig, bSig, zSig;
1995
    a = float32_squash_input_denormal(a STATUS_VAR);
1996
    b = float32_squash_input_denormal(b STATUS_VAR);
1997

    
1998
    aSig = extractFloat32Frac( a );
1999
    aExp = extractFloat32Exp( a );
2000
    aSign = extractFloat32Sign( a );
2001
    bSig = extractFloat32Frac( b );
2002
    bExp = extractFloat32Exp( b );
2003
    bSign = extractFloat32Sign( b );
2004
    zSign = aSign ^ bSign;
2005
    if ( aExp == 0xFF ) {
2006
        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2007
        if ( bExp == 0xFF ) {
2008
            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2009
            float_raise( float_flag_invalid STATUS_VAR);
2010
            return float32_default_nan;
2011
        }
2012
        return packFloat32( zSign, 0xFF, 0 );
2013
    }
2014
    if ( bExp == 0xFF ) {
2015
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2016
        return packFloat32( zSign, 0, 0 );
2017
    }
2018
    if ( bExp == 0 ) {
2019
        if ( bSig == 0 ) {
2020
            if ( ( aExp | aSig ) == 0 ) {
2021
                float_raise( float_flag_invalid STATUS_VAR);
2022
                return float32_default_nan;
2023
            }
2024
            float_raise( float_flag_divbyzero STATUS_VAR);
2025
            return packFloat32( zSign, 0xFF, 0 );
2026
        }
2027
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2028
    }
2029
    if ( aExp == 0 ) {
2030
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2031
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2032
    }
2033
    zExp = aExp - bExp + 0x7D;
2034
    aSig = ( aSig | 0x00800000 )<<7;
2035
    bSig = ( bSig | 0x00800000 )<<8;
2036
    if ( bSig <= ( aSig + aSig ) ) {
2037
        aSig >>= 1;
2038
        ++zExp;
2039
    }
2040
    zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2041
    if ( ( zSig & 0x3F ) == 0 ) {
2042
        zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2043
    }
2044
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2045

    
2046
}
2047

    
2048
/*----------------------------------------------------------------------------
2049
| Returns the remainder of the single-precision floating-point value `a'
2050
| with respect to the corresponding value `b'.  The operation is performed
2051
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2052
*----------------------------------------------------------------------------*/
2053

    
2054
float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2055
{
2056
    flag aSign, zSign;
2057
    int16 aExp, bExp, expDiff;
2058
    uint32_t aSig, bSig;
2059
    uint32_t q;
2060
    uint64_t aSig64, bSig64, q64;
2061
    uint32_t alternateASig;
2062
    int32_t sigMean;
2063
    a = float32_squash_input_denormal(a STATUS_VAR);
2064
    b = float32_squash_input_denormal(b STATUS_VAR);
2065

    
2066
    aSig = extractFloat32Frac( a );
2067
    aExp = extractFloat32Exp( a );
2068
    aSign = extractFloat32Sign( a );
2069
    bSig = extractFloat32Frac( b );
2070
    bExp = extractFloat32Exp( b );
2071
    if ( aExp == 0xFF ) {
2072
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2073
            return propagateFloat32NaN( a, b STATUS_VAR );
2074
        }
2075
        float_raise( float_flag_invalid STATUS_VAR);
2076
        return float32_default_nan;
2077
    }
2078
    if ( bExp == 0xFF ) {
2079
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2080
        return a;
2081
    }
2082
    if ( bExp == 0 ) {
2083
        if ( bSig == 0 ) {
2084
            float_raise( float_flag_invalid STATUS_VAR);
2085
            return float32_default_nan;
2086
        }
2087
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2088
    }
2089
    if ( aExp == 0 ) {
2090
        if ( aSig == 0 ) return a;
2091
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2092
    }
2093
    expDiff = aExp - bExp;
2094
    aSig |= 0x00800000;
2095
    bSig |= 0x00800000;
2096
    if ( expDiff < 32 ) {
2097
        aSig <<= 8;
2098
        bSig <<= 8;
2099
        if ( expDiff < 0 ) {
2100
            if ( expDiff < -1 ) return a;
2101
            aSig >>= 1;
2102
        }
2103
        q = ( bSig <= aSig );
2104
        if ( q ) aSig -= bSig;
2105
        if ( 0 < expDiff ) {
2106
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2107
            q >>= 32 - expDiff;
2108
            bSig >>= 2;
2109
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2110
        }
2111
        else {
2112
            aSig >>= 2;
2113
            bSig >>= 2;
2114
        }
2115
    }
2116
    else {
2117
        if ( bSig <= aSig ) aSig -= bSig;
2118
        aSig64 = ( (uint64_t) aSig )<<40;
2119
        bSig64 = ( (uint64_t) bSig )<<40;
2120
        expDiff -= 64;
2121
        while ( 0 < expDiff ) {
2122
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2123
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2124
            aSig64 = - ( ( bSig * q64 )<<38 );
2125
            expDiff -= 62;
2126
        }
2127
        expDiff += 64;
2128
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2129
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2130
        q = q64>>( 64 - expDiff );
2131
        bSig <<= 6;
2132
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2133
    }
2134
    do {
2135
        alternateASig = aSig;
2136
        ++q;
2137
        aSig -= bSig;
2138
    } while ( 0 <= (int32_t) aSig );
2139
    sigMean = aSig + alternateASig;
2140
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2141
        aSig = alternateASig;
2142
    }
2143
    zSign = ( (int32_t) aSig < 0 );
2144
    if ( zSign ) aSig = - aSig;
2145
    return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2146

    
2147
}
2148

    
2149
/*----------------------------------------------------------------------------
2150
| Returns the square root of the single-precision floating-point value `a'.
2151
| The operation is performed according to the IEC/IEEE Standard for Binary
2152
| Floating-Point Arithmetic.
2153
*----------------------------------------------------------------------------*/
2154

    
2155
float32 float32_sqrt( float32 a STATUS_PARAM )
2156
{
2157
    flag aSign;
2158
    int16 aExp, zExp;
2159
    uint32_t aSig, zSig;
2160
    uint64_t rem, term;
2161
    a = float32_squash_input_denormal(a STATUS_VAR);
2162

    
2163
    aSig = extractFloat32Frac( a );
2164
    aExp = extractFloat32Exp( a );
2165
    aSign = extractFloat32Sign( a );
2166
    if ( aExp == 0xFF ) {
2167
        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2168
        if ( ! aSign ) return a;
2169
        float_raise( float_flag_invalid STATUS_VAR);
2170
        return float32_default_nan;
2171
    }
2172
    if ( aSign ) {
2173
        if ( ( aExp | aSig ) == 0 ) return a;
2174
        float_raise( float_flag_invalid STATUS_VAR);
2175
        return float32_default_nan;
2176
    }
2177
    if ( aExp == 0 ) {
2178
        if ( aSig == 0 ) return float32_zero;
2179
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2180
    }
2181
    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2182
    aSig = ( aSig | 0x00800000 )<<8;
2183
    zSig = estimateSqrt32( aExp, aSig ) + 2;
2184
    if ( ( zSig & 0x7F ) <= 5 ) {
2185
        if ( zSig < 2 ) {
2186
            zSig = 0x7FFFFFFF;
2187
            goto roundAndPack;
2188
        }
2189
        aSig >>= aExp & 1;
2190
        term = ( (uint64_t) zSig ) * zSig;
2191
        rem = ( ( (uint64_t) aSig )<<32 ) - term;
2192
        while ( (int64_t) rem < 0 ) {
2193
            --zSig;
2194
            rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2195
        }
2196
        zSig |= ( rem != 0 );
2197
    }
2198
    shift32RightJamming( zSig, 1, &zSig );
2199
 roundAndPack:
2200
    return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2201

    
2202
}
2203

    
2204
/*----------------------------------------------------------------------------
2205
| Returns the binary exponential of the single-precision floating-point value
2206
| `a'. The operation is performed according to the IEC/IEEE Standard for
2207
| Binary Floating-Point Arithmetic.
2208
|
2209
| Uses the following identities:
2210
|
2211
| 1. -------------------------------------------------------------------------
2212
|      x    x*ln(2)
2213
|     2  = e
2214
|
2215
| 2. -------------------------------------------------------------------------
2216
|                      2     3     4     5           n
2217
|      x        x     x     x     x     x           x
2218
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2219
|               1!    2!    3!    4!    5!          n!
2220
*----------------------------------------------------------------------------*/
2221

    
2222
static const float64 float32_exp2_coefficients[15] =
2223
{
2224
    const_float64( 0x3ff0000000000000ll ), /*  1 */
2225
    const_float64( 0x3fe0000000000000ll ), /*  2 */
2226
    const_float64( 0x3fc5555555555555ll ), /*  3 */
2227
    const_float64( 0x3fa5555555555555ll ), /*  4 */
2228
    const_float64( 0x3f81111111111111ll ), /*  5 */
2229
    const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2230
    const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2231
    const_float64( 0x3efa01a01a01a01all ), /*  8 */
2232
    const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2233
    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2234
    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2235
    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2236
    const_float64( 0x3de6124613a86d09ll ), /* 13 */
2237
    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2238
    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2239
};
2240

    
2241
float32 float32_exp2( float32 a STATUS_PARAM )
2242
{
2243
    flag aSign;
2244
    int16 aExp;
2245
    uint32_t aSig;
2246
    float64 r, x, xn;
2247
    int i;
2248
    a = float32_squash_input_denormal(a STATUS_VAR);
2249

    
2250
    aSig = extractFloat32Frac( a );
2251
    aExp = extractFloat32Exp( a );
2252
    aSign = extractFloat32Sign( a );
2253

    
2254
    if ( aExp == 0xFF) {
2255
        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2256
        return (aSign) ? float32_zero : a;
2257
    }
2258
    if (aExp == 0) {
2259
        if (aSig == 0) return float32_one;
2260
    }
2261

    
2262
    float_raise( float_flag_inexact STATUS_VAR);
2263

    
2264
    /* ******************************* */
2265
    /* using float64 for approximation */
2266
    /* ******************************* */
2267
    x = float32_to_float64(a STATUS_VAR);
2268
    x = float64_mul(x, float64_ln2 STATUS_VAR);
2269

    
2270
    xn = x;
2271
    r = float64_one;
2272
    for (i = 0 ; i < 15 ; i++) {
2273
        float64 f;
2274

    
2275
        f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2276
        r = float64_add(r, f STATUS_VAR);
2277

    
2278
        xn = float64_mul(xn, x STATUS_VAR);
2279
    }
2280

    
2281
    return float64_to_float32(r, status);
2282
}
2283

    
2284
/*----------------------------------------------------------------------------
2285
| Returns the binary log of the single-precision floating-point value `a'.
2286
| The operation is performed according to the IEC/IEEE Standard for Binary
2287
| Floating-Point Arithmetic.
2288
*----------------------------------------------------------------------------*/
2289
float32 float32_log2( float32 a STATUS_PARAM )
2290
{
2291
    flag aSign, zSign;
2292
    int16 aExp;
2293
    uint32_t aSig, zSig, i;
2294

    
2295
    a = float32_squash_input_denormal(a STATUS_VAR);
2296
    aSig = extractFloat32Frac( a );
2297
    aExp = extractFloat32Exp( a );
2298
    aSign = extractFloat32Sign( a );
2299

    
2300
    if ( aExp == 0 ) {
2301
        if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2302
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2303
    }
2304
    if ( aSign ) {
2305
        float_raise( float_flag_invalid STATUS_VAR);
2306
        return float32_default_nan;
2307
    }
2308
    if ( aExp == 0xFF ) {
2309
        if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2310
        return a;
2311
    }
2312

    
2313
    aExp -= 0x7F;
2314
    aSig |= 0x00800000;
2315
    zSign = aExp < 0;
2316
    zSig = aExp << 23;
2317

    
2318
    for (i = 1 << 22; i > 0; i >>= 1) {
2319
        aSig = ( (uint64_t)aSig * aSig ) >> 23;
2320
        if ( aSig & 0x01000000 ) {
2321
            aSig >>= 1;
2322
            zSig |= i;
2323
        }
2324
    }
2325

    
2326
    if ( zSign )
2327
        zSig = -zSig;
2328

    
2329
    return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2330
}
2331

    
2332
/*----------------------------------------------------------------------------
2333
| Returns 1 if the single-precision floating-point value `a' is equal to
2334
| the corresponding value `b', and 0 otherwise.  The invalid exception is
2335
| raised if either operand is a NaN.  Otherwise, the comparison is performed
2336
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2337
*----------------------------------------------------------------------------*/
2338

    
2339
int float32_eq( float32 a, float32 b STATUS_PARAM )
2340
{
2341
    uint32_t av, bv;
2342
    a = float32_squash_input_denormal(a STATUS_VAR);
2343
    b = float32_squash_input_denormal(b STATUS_VAR);
2344

    
2345
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2346
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2347
       ) {
2348
        float_raise( float_flag_invalid STATUS_VAR);
2349
        return 0;
2350
    }
2351
    av = float32_val(a);
2352
    bv = float32_val(b);
2353
    return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2354
}
2355

    
2356
/*----------------------------------------------------------------------------
2357
| Returns 1 if the single-precision floating-point value `a' is less than
2358
| or equal to the corresponding value `b', and 0 otherwise.  The invalid
2359
| exception is raised if either operand is a NaN.  The comparison is performed
2360
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2361
*----------------------------------------------------------------------------*/
2362

    
2363
int float32_le( float32 a, float32 b STATUS_PARAM )
2364
{
2365
    flag aSign, bSign;
2366
    uint32_t av, bv;
2367
    a = float32_squash_input_denormal(a STATUS_VAR);
2368
    b = float32_squash_input_denormal(b STATUS_VAR);
2369

    
2370
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2371
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2372
       ) {
2373
        float_raise( float_flag_invalid STATUS_VAR);
2374
        return 0;
2375
    }
2376
    aSign = extractFloat32Sign( a );
2377
    bSign = extractFloat32Sign( b );
2378
    av = float32_val(a);
2379
    bv = float32_val(b);
2380
    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2381
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2382

    
2383
}
2384

    
2385
/*----------------------------------------------------------------------------
2386
| Returns 1 if the single-precision floating-point value `a' is less than
2387
| the corresponding value `b', and 0 otherwise.  The invalid exception is
2388
| raised if either operand is a NaN.  The comparison is performed according
2389
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2390
*----------------------------------------------------------------------------*/
2391

    
2392
int float32_lt( float32 a, float32 b STATUS_PARAM )
2393
{
2394
    flag aSign, bSign;
2395
    uint32_t av, bv;
2396
    a = float32_squash_input_denormal(a STATUS_VAR);
2397
    b = float32_squash_input_denormal(b STATUS_VAR);
2398

    
2399
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2400
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2401
       ) {
2402
        float_raise( float_flag_invalid STATUS_VAR);
2403
        return 0;
2404
    }
2405
    aSign = extractFloat32Sign( a );
2406
    bSign = extractFloat32Sign( b );
2407
    av = float32_val(a);
2408
    bv = float32_val(b);
2409
    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2410
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2411

    
2412
}
2413

    
2414
/*----------------------------------------------------------------------------
2415
| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2416
| be compared, and 0 otherwise.  The invalid exception is raised if either
2417
| operand is a NaN.  The comparison is performed according to the IEC/IEEE
2418
| Standard for Binary Floating-Point Arithmetic.
2419
*----------------------------------------------------------------------------*/
2420

    
2421
int float32_unordered( float32 a, float32 b STATUS_PARAM )
2422
{
2423
    a = float32_squash_input_denormal(a STATUS_VAR);
2424
    b = float32_squash_input_denormal(b STATUS_VAR);
2425

    
2426
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2427
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2428
       ) {
2429
        float_raise( float_flag_invalid STATUS_VAR);
2430
        return 1;
2431
    }
2432
    return 0;
2433
}
2434

    
2435
/*----------------------------------------------------------------------------
2436
| Returns 1 if the single-precision floating-point value `a' is equal to
2437
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2438
| exception.  The comparison is performed according to the IEC/IEEE Standard
2439
| for Binary Floating-Point Arithmetic.
2440
*----------------------------------------------------------------------------*/
2441

    
2442
int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2443
{
2444
    a = float32_squash_input_denormal(a STATUS_VAR);
2445
    b = float32_squash_input_denormal(b STATUS_VAR);
2446

    
2447
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2448
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2449
       ) {
2450
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2451
            float_raise( float_flag_invalid STATUS_VAR);
2452
        }
2453
        return 0;
2454
    }
2455
    return ( float32_val(a) == float32_val(b) ) ||
2456
            ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2457
}
2458

    
2459
/*----------------------------------------------------------------------------
2460
| Returns 1 if the single-precision floating-point value `a' is less than or
2461
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2462
| cause an exception.  Otherwise, the comparison is performed according to the
2463
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2464
*----------------------------------------------------------------------------*/
2465

    
2466
int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2467
{
2468
    flag aSign, bSign;
2469
    uint32_t av, bv;
2470
    a = float32_squash_input_denormal(a STATUS_VAR);
2471
    b = float32_squash_input_denormal(b STATUS_VAR);
2472

    
2473
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2474
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2475
       ) {
2476
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2477
            float_raise( float_flag_invalid STATUS_VAR);
2478
        }
2479
        return 0;
2480
    }
2481
    aSign = extractFloat32Sign( a );
2482
    bSign = extractFloat32Sign( b );
2483
    av = float32_val(a);
2484
    bv = float32_val(b);
2485
    if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2486
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
2487

    
2488
}
2489

    
2490
/*----------------------------------------------------------------------------
2491
| Returns 1 if the single-precision floating-point value `a' is less than
2492
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2493
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2494
| Standard for Binary Floating-Point Arithmetic.
2495
*----------------------------------------------------------------------------*/
2496

    
2497
int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2498
{
2499
    flag aSign, bSign;
2500
    uint32_t av, bv;
2501
    a = float32_squash_input_denormal(a STATUS_VAR);
2502
    b = float32_squash_input_denormal(b STATUS_VAR);
2503

    
2504
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2505
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2506
       ) {
2507
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2508
            float_raise( float_flag_invalid STATUS_VAR);
2509
        }
2510
        return 0;
2511
    }
2512
    aSign = extractFloat32Sign( a );
2513
    bSign = extractFloat32Sign( b );
2514
    av = float32_val(a);
2515
    bv = float32_val(b);
2516
    if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2517
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
2518

    
2519
}
2520

    
2521
/*----------------------------------------------------------------------------
2522
| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2523
| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
2524
| comparison is performed according to the IEC/IEEE Standard for Binary
2525
| Floating-Point Arithmetic.
2526
*----------------------------------------------------------------------------*/
2527

    
2528
int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2529
{
2530
    a = float32_squash_input_denormal(a STATUS_VAR);
2531
    b = float32_squash_input_denormal(b STATUS_VAR);
2532

    
2533
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2534
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2535
       ) {
2536
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2537
            float_raise( float_flag_invalid STATUS_VAR);
2538
        }
2539
        return 1;
2540
    }
2541
    return 0;
2542
}
2543

    
2544
/*----------------------------------------------------------------------------
2545
| Returns the result of converting the double-precision floating-point value
2546
| `a' to the 32-bit two's complement integer format.  The conversion is
2547
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2548
| Arithmetic---which means in particular that the conversion is rounded
2549
| according to the current rounding mode.  If `a' is a NaN, the largest
2550
| positive integer is returned.  Otherwise, if the conversion overflows, the
2551
| largest integer with the same sign as `a' is returned.
2552
*----------------------------------------------------------------------------*/
2553

    
2554
int32 float64_to_int32( float64 a STATUS_PARAM )
2555
{
2556
    flag aSign;
2557
    int16 aExp, shiftCount;
2558
    uint64_t aSig;
2559
    a = float64_squash_input_denormal(a STATUS_VAR);
2560

    
2561
    aSig = extractFloat64Frac( a );
2562
    aExp = extractFloat64Exp( a );
2563
    aSign = extractFloat64Sign( a );
2564
    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2565
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2566
    shiftCount = 0x42C - aExp;
2567
    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2568
    return roundAndPackInt32( aSign, aSig STATUS_VAR );
2569

    
2570
}
2571

    
2572
/*----------------------------------------------------------------------------
2573
| Returns the result of converting the double-precision floating-point value
2574
| `a' to the 32-bit two's complement integer format.  The conversion is
2575
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2576
| Arithmetic, except that the conversion is always rounded toward zero.
2577
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2578
| the conversion overflows, the largest integer with the same sign as `a' is
2579
| returned.
2580
*----------------------------------------------------------------------------*/
2581

    
2582
int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2583
{
2584
    flag aSign;
2585
    int16 aExp, shiftCount;
2586
    uint64_t aSig, savedASig;
2587
    int32 z;
2588
    a = float64_squash_input_denormal(a STATUS_VAR);
2589

    
2590
    aSig = extractFloat64Frac( a );
2591
    aExp = extractFloat64Exp( a );
2592
    aSign = extractFloat64Sign( a );
2593
    if ( 0x41E < aExp ) {
2594
        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2595
        goto invalid;
2596
    }
2597
    else if ( aExp < 0x3FF ) {
2598
        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2599
        return 0;
2600
    }
2601
    aSig |= LIT64( 0x0010000000000000 );
2602
    shiftCount = 0x433 - aExp;
2603
    savedASig = aSig;
2604
    aSig >>= shiftCount;
2605
    z = aSig;
2606
    if ( aSign ) z = - z;
2607
    if ( ( z < 0 ) ^ aSign ) {
2608
 invalid:
2609
        float_raise( float_flag_invalid STATUS_VAR);
2610
        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2611
    }
2612
    if ( ( aSig<<shiftCount ) != savedASig ) {
2613
        STATUS(float_exception_flags) |= float_flag_inexact;
2614
    }
2615
    return z;
2616

    
2617
}
2618

    
2619
/*----------------------------------------------------------------------------
2620
| Returns the result of converting the double-precision floating-point value
2621
| `a' to the 16-bit two's complement integer format.  The conversion is
2622
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2623
| Arithmetic, except that the conversion is always rounded toward zero.
2624
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2625
| the conversion overflows, the largest integer with the same sign as `a' is
2626
| returned.
2627
*----------------------------------------------------------------------------*/
2628

    
2629
int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2630
{
2631
    flag aSign;
2632
    int16 aExp, shiftCount;
2633
    uint64_t aSig, savedASig;
2634
    int32 z;
2635

    
2636
    aSig = extractFloat64Frac( a );
2637
    aExp = extractFloat64Exp( a );
2638
    aSign = extractFloat64Sign( a );
2639
    if ( 0x40E < aExp ) {
2640
        if ( ( aExp == 0x7FF ) && aSig ) {
2641
            aSign = 0;
2642
        }
2643
        goto invalid;
2644
    }
2645
    else if ( aExp < 0x3FF ) {
2646
        if ( aExp || aSig ) {
2647
            STATUS(float_exception_flags) |= float_flag_inexact;
2648
        }
2649
        return 0;
2650
    }
2651
    aSig |= LIT64( 0x0010000000000000 );
2652
    shiftCount = 0x433 - aExp;
2653
    savedASig = aSig;
2654
    aSig >>= shiftCount;
2655
    z = aSig;
2656
    if ( aSign ) {
2657
        z = - z;
2658
    }
2659
    if ( ( (int16_t)z < 0 ) ^ aSign ) {
2660
 invalid:
2661
        float_raise( float_flag_invalid STATUS_VAR);
2662
        return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2663
    }
2664
    if ( ( aSig<<shiftCount ) != savedASig ) {
2665
        STATUS(float_exception_flags) |= float_flag_inexact;
2666
    }
2667
    return z;
2668
}
2669

    
2670
/*----------------------------------------------------------------------------
2671
| Returns the result of converting the double-precision floating-point value
2672
| `a' to the 64-bit two's complement integer format.  The conversion is
2673
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2674
| Arithmetic---which means in particular that the conversion is rounded
2675
| according to the current rounding mode.  If `a' is a NaN, the largest
2676
| positive integer is returned.  Otherwise, if the conversion overflows, the
2677
| largest integer with the same sign as `a' is returned.
2678
*----------------------------------------------------------------------------*/
2679

    
2680
int64 float64_to_int64( float64 a STATUS_PARAM )
2681
{
2682
    flag aSign;
2683
    int16 aExp, shiftCount;
2684
    uint64_t aSig, aSigExtra;
2685
    a = float64_squash_input_denormal(a STATUS_VAR);
2686

    
2687
    aSig = extractFloat64Frac( a );
2688
    aExp = extractFloat64Exp( a );
2689
    aSign = extractFloat64Sign( a );
2690
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2691
    shiftCount = 0x433 - aExp;
2692
    if ( shiftCount <= 0 ) {
2693
        if ( 0x43E < aExp ) {
2694
            float_raise( float_flag_invalid STATUS_VAR);
2695
            if (    ! aSign
2696
                 || (    ( aExp == 0x7FF )
2697
                      && ( aSig != LIT64( 0x0010000000000000 ) ) )
2698
               ) {
2699
                return LIT64( 0x7FFFFFFFFFFFFFFF );
2700
            }
2701
            return (int64_t) LIT64( 0x8000000000000000 );
2702
        }
2703
        aSigExtra = 0;
2704
        aSig <<= - shiftCount;
2705
    }
2706
    else {
2707
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2708
    }
2709
    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2710

    
2711
}
2712

    
2713
/*----------------------------------------------------------------------------
2714
| Returns the result of converting the double-precision floating-point value
2715
| `a' to the 64-bit two's complement integer format.  The conversion is
2716
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2717
| Arithmetic, except that the conversion is always rounded toward zero.
2718
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2719
| the conversion overflows, the largest integer with the same sign as `a' is
2720
| returned.
2721
*----------------------------------------------------------------------------*/
2722

    
2723
int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2724
{
2725
    flag aSign;
2726
    int16 aExp, shiftCount;
2727
    uint64_t aSig;
2728
    int64 z;
2729
    a = float64_squash_input_denormal(a STATUS_VAR);
2730

    
2731
    aSig = extractFloat64Frac( a );
2732
    aExp = extractFloat64Exp( a );
2733
    aSign = extractFloat64Sign( a );
2734
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2735
    shiftCount = aExp - 0x433;
2736
    if ( 0 <= shiftCount ) {
2737
        if ( 0x43E <= aExp ) {
2738
            if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2739
                float_raise( float_flag_invalid STATUS_VAR);
2740
                if (    ! aSign
2741
                     || (    ( aExp == 0x7FF )
2742
                          && ( aSig != LIT64( 0x0010000000000000 ) ) )
2743
                   ) {
2744
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
2745
                }
2746
            }
2747
            return (int64_t) LIT64( 0x8000000000000000 );
2748
        }
2749
        z = aSig<<shiftCount;
2750
    }
2751
    else {
2752
        if ( aExp < 0x3FE ) {
2753
            if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2754
            return 0;
2755
        }
2756
        z = aSig>>( - shiftCount );
2757
        if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
2758
            STATUS(float_exception_flags) |= float_flag_inexact;
2759
        }
2760
    }
2761
    if ( aSign ) z = - z;
2762
    return z;
2763

    
2764
}
2765

    
2766
/*----------------------------------------------------------------------------
2767
| Returns the result of converting the double-precision floating-point value
2768
| `a' to the single-precision floating-point format.  The conversion is
2769
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2770
| Arithmetic.
2771
*----------------------------------------------------------------------------*/
2772

    
2773
float32 float64_to_float32( float64 a STATUS_PARAM )
2774
{
2775
    flag aSign;
2776
    int16 aExp;
2777
    uint64_t aSig;
2778
    uint32_t zSig;
2779
    a = float64_squash_input_denormal(a STATUS_VAR);
2780

    
2781
    aSig = extractFloat64Frac( a );
2782
    aExp = extractFloat64Exp( a );
2783
    aSign = extractFloat64Sign( a );
2784
    if ( aExp == 0x7FF ) {
2785
        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2786
        return packFloat32( aSign, 0xFF, 0 );
2787
    }
2788
    shift64RightJamming( aSig, 22, &aSig );
2789
    zSig = aSig;
2790
    if ( aExp || zSig ) {
2791
        zSig |= 0x40000000;
2792
        aExp -= 0x381;
2793
    }
2794
    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2795

    
2796
}
2797

    
2798

    
2799
/*----------------------------------------------------------------------------
2800
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2801
| half-precision floating-point value, returning the result.  After being
2802
| shifted into the proper positions, the three fields are simply added
2803
| together to form the result.  This means that any integer portion of `zSig'
2804
| will be added into the exponent.  Since a properly normalized significand
2805
| will have an integer portion equal to 1, the `zExp' input should be 1 less
2806
| than the desired result exponent whenever `zSig' is a complete, normalized
2807
| significand.
2808
*----------------------------------------------------------------------------*/
2809
static float16 packFloat16(flag zSign, int16 zExp, uint16_t zSig)
2810
{
2811
    return make_float16(
2812
        (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
2813
}
2814

    
2815
/* Half precision floats come in two formats: standard IEEE and "ARM" format.
2816
   The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
2817

    
2818
float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
2819
{
2820
    flag aSign;
2821
    int16 aExp;
2822
    uint32_t aSig;
2823

    
2824
    aSign = extractFloat16Sign(a);
2825
    aExp = extractFloat16Exp(a);
2826
    aSig = extractFloat16Frac(a);
2827

    
2828
    if (aExp == 0x1f && ieee) {
2829
        if (aSig) {
2830
            return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
2831
        }
2832
        return packFloat32(aSign, 0xff, aSig << 13);
2833
    }
2834
    if (aExp == 0) {
2835
        int8 shiftCount;
2836

    
2837
        if (aSig == 0) {
2838
            return packFloat32(aSign, 0, 0);
2839
        }
2840

    
2841
        shiftCount = countLeadingZeros32( aSig ) - 21;
2842
        aSig = aSig << shiftCount;
2843
        aExp = -shiftCount;
2844
    }
2845
    return packFloat32( aSign, aExp + 0x70, aSig << 13);
2846
}
2847

    
2848
float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
2849
{
2850
    flag aSign;
2851
    int16 aExp;
2852
    uint32_t aSig;
2853
    uint32_t mask;
2854
    uint32_t increment;
2855
    int8 roundingMode;
2856
    a = float32_squash_input_denormal(a STATUS_VAR);
2857

    
2858
    aSig = extractFloat32Frac( a );
2859
    aExp = extractFloat32Exp( a );
2860
    aSign = extractFloat32Sign( a );
2861
    if ( aExp == 0xFF ) {
2862
        if (aSig) {
2863
            /* Input is a NaN */
2864
            float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2865
            if (!ieee) {
2866
                return packFloat16(aSign, 0, 0);
2867
            }
2868
            return r;
2869
        }
2870
        /* Infinity */
2871
        if (!ieee) {
2872
            float_raise(float_flag_invalid STATUS_VAR);
2873
            return packFloat16(aSign, 0x1f, 0x3ff);
2874
        }
2875
        return packFloat16(aSign, 0x1f, 0);
2876
    }
2877
    if (aExp == 0 && aSig == 0) {
2878
        return packFloat16(aSign, 0, 0);
2879
    }
2880
    /* Decimal point between bits 22 and 23.  */
2881
    aSig |= 0x00800000;
2882
    aExp -= 0x7f;
2883
    if (aExp < -14) {
2884
        mask = 0x00ffffff;
2885
        if (aExp >= -24) {
2886
            mask >>= 25 + aExp;
2887
        }
2888
    } else {
2889
        mask = 0x00001fff;
2890
    }
2891
    if (aSig & mask) {
2892
        float_raise( float_flag_underflow STATUS_VAR );
2893
        roundingMode = STATUS(float_rounding_mode);
2894
        switch (roundingMode) {
2895
        case float_round_nearest_even:
2896
            increment = (mask + 1) >> 1;
2897
            if ((aSig & mask) == increment) {
2898
                increment = aSig & (increment << 1);
2899
            }
2900
            break;
2901
        case float_round_up:
2902
            increment = aSign ? 0 : mask;
2903
            break;
2904
        case float_round_down:
2905
            increment = aSign ? mask : 0;
2906
            break;
2907
        default: /* round_to_zero */
2908
            increment = 0;
2909
            break;
2910
        }
2911
        aSig += increment;
2912
        if (aSig >= 0x01000000) {
2913
            aSig >>= 1;
2914
            aExp++;
2915
        }
2916
    } else if (aExp < -14
2917
          && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2918
        float_raise( float_flag_underflow STATUS_VAR);
2919
    }
2920

    
2921
    if (ieee) {
2922
        if (aExp > 15) {
2923
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2924
            return packFloat16(aSign, 0x1f, 0);
2925
        }
2926
    } else {
2927
        if (aExp > 16) {
2928
            float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
2929
            return packFloat16(aSign, 0x1f, 0x3ff);
2930
        }
2931
    }
2932
    if (aExp < -24) {
2933
        return packFloat16(aSign, 0, 0);
2934
    }
2935
    if (aExp < -14) {
2936
        aSig >>= -14 - aExp;
2937
        aExp = -14;
2938
    }
2939
    return packFloat16(aSign, aExp + 14, aSig >> 13);
2940
}
2941

    
2942
#ifdef FLOATX80
2943

    
2944
/*----------------------------------------------------------------------------
2945
| Returns the result of converting the double-precision floating-point value
2946
| `a' to the extended double-precision floating-point format.  The conversion
2947
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2948
| Arithmetic.
2949
*----------------------------------------------------------------------------*/
2950

    
2951
floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2952
{
2953
    flag aSign;
2954
    int16 aExp;
2955
    uint64_t aSig;
2956

    
2957
    a = float64_squash_input_denormal(a STATUS_VAR);
2958
    aSig = extractFloat64Frac( a );
2959
    aExp = extractFloat64Exp( a );
2960
    aSign = extractFloat64Sign( a );
2961
    if ( aExp == 0x7FF ) {
2962
        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2963
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2964
    }
2965
    if ( aExp == 0 ) {
2966
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2967
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2968
    }
2969
    return
2970
        packFloatx80(
2971
            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2972

    
2973
}
2974

    
2975
#endif
2976

    
2977
#ifdef FLOAT128
2978

    
2979
/*----------------------------------------------------------------------------
2980
| Returns the result of converting the double-precision floating-point value
2981
| `a' to the quadruple-precision floating-point format.  The conversion is
2982
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2983
| Arithmetic.
2984
*----------------------------------------------------------------------------*/
2985

    
2986
float128 float64_to_float128( float64 a STATUS_PARAM )
2987
{
2988
    flag aSign;
2989
    int16 aExp;
2990
    uint64_t aSig, zSig0, zSig1;
2991

    
2992
    a = float64_squash_input_denormal(a STATUS_VAR);
2993
    aSig = extractFloat64Frac( a );
2994
    aExp = extractFloat64Exp( a );
2995
    aSign = extractFloat64Sign( a );
2996
    if ( aExp == 0x7FF ) {
2997
        if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2998
        return packFloat128( aSign, 0x7FFF, 0, 0 );
2999
    }
3000
    if ( aExp == 0 ) {
3001
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3002
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3003
        --aExp;
3004
    }
3005
    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3006
    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3007

    
3008
}
3009

    
3010
#endif
3011

    
3012
/*----------------------------------------------------------------------------
3013
| Rounds the double-precision floating-point value `a' to an integer, and
3014
| returns the result as a double-precision floating-point value.  The
3015
| operation is performed according to the IEC/IEEE Standard for Binary
3016
| Floating-Point Arithmetic.
3017
*----------------------------------------------------------------------------*/
3018

    
3019
float64 float64_round_to_int( float64 a STATUS_PARAM )
3020
{
3021
    flag aSign;
3022
    int16 aExp;
3023
    uint64_t lastBitMask, roundBitsMask;
3024
    int8 roundingMode;
3025
    uint64_t z;
3026
    a = float64_squash_input_denormal(a STATUS_VAR);
3027

    
3028
    aExp = extractFloat64Exp( a );
3029
    if ( 0x433 <= aExp ) {
3030
        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3031
            return propagateFloat64NaN( a, a STATUS_VAR );
3032
        }
3033
        return a;
3034
    }
3035
    if ( aExp < 0x3FF ) {
3036
        if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3037
        STATUS(float_exception_flags) |= float_flag_inexact;
3038
        aSign = extractFloat64Sign( a );
3039
        switch ( STATUS(float_rounding_mode) ) {
3040
         case float_round_nearest_even:
3041
            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3042
                return packFloat64( aSign, 0x3FF, 0 );
3043
            }
3044
            break;
3045
         case float_round_down:
3046
            return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3047
         case float_round_up:
3048
            return make_float64(
3049
            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3050
        }
3051
        return packFloat64( aSign, 0, 0 );
3052
    }
3053
    lastBitMask = 1;
3054
    lastBitMask <<= 0x433 - aExp;
3055
    roundBitsMask = lastBitMask - 1;
3056
    z = float64_val(a);
3057
    roundingMode = STATUS(float_rounding_mode);
3058
    if ( roundingMode == float_round_nearest_even ) {
3059
        z += lastBitMask>>1;
3060
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3061
    }
3062
    else if ( roundingMode != float_round_to_zero ) {
3063
        if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3064
            z += roundBitsMask;
3065
        }
3066
    }
3067
    z &= ~ roundBitsMask;
3068
    if ( z != float64_val(a) )
3069
        STATUS(float_exception_flags) |= float_flag_inexact;
3070
    return make_float64(z);
3071

    
3072
}
3073

    
3074
float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3075
{
3076
    int oldmode;
3077
    float64 res;
3078
    oldmode = STATUS(float_rounding_mode);
3079
    STATUS(float_rounding_mode) = float_round_to_zero;
3080
    res = float64_round_to_int(a STATUS_VAR);
3081
    STATUS(float_rounding_mode) = oldmode;
3082
    return res;
3083
}
3084

    
3085
/*----------------------------------------------------------------------------
3086
| Returns the result of adding the absolute values of the double-precision
3087
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3088
| before being returned.  `zSign' is ignored if the result is a NaN.
3089
| The addition is performed according to the IEC/IEEE Standard for Binary
3090
| Floating-Point Arithmetic.
3091
*----------------------------------------------------------------------------*/
3092

    
3093
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3094
{
3095
    int16 aExp, bExp, zExp;
3096
    uint64_t aSig, bSig, zSig;
3097
    int16 expDiff;
3098

    
3099
    aSig = extractFloat64Frac( a );
3100
    aExp = extractFloat64Exp( a );
3101
    bSig = extractFloat64Frac( b );
3102
    bExp = extractFloat64Exp( b );
3103
    expDiff = aExp - bExp;
3104
    aSig <<= 9;
3105
    bSig <<= 9;
3106
    if ( 0 < expDiff ) {
3107
        if ( aExp == 0x7FF ) {
3108
            if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3109
            return a;
3110
        }
3111
        if ( bExp == 0 ) {
3112
            --expDiff;
3113
        }
3114
        else {
3115
            bSig |= LIT64( 0x2000000000000000 );
3116
        }
3117
        shift64RightJamming( bSig, expDiff, &bSig );
3118
        zExp = aExp;
3119
    }
3120
    else if ( expDiff < 0 ) {
3121
        if ( bExp == 0x7FF ) {
3122
            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3123
            return packFloat64( zSign, 0x7FF, 0 );
3124
        }
3125
        if ( aExp == 0 ) {
3126
            ++expDiff;
3127
        }
3128
        else {
3129
            aSig |= LIT64( 0x2000000000000000 );
3130
        }
3131
        shift64RightJamming( aSig, - expDiff, &aSig );
3132
        zExp = bExp;
3133
    }
3134
    else {
3135
        if ( aExp == 0x7FF ) {
3136
            if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3137
            return a;
3138
        }
3139
        if ( aExp == 0 ) {
3140
            if (STATUS(flush_to_zero)) {
3141
                if (aSig | bSig) {
3142
                    float_raise(float_flag_output_denormal STATUS_VAR);
3143
                }
3144
                return packFloat64(zSign, 0, 0);
3145
            }
3146
            return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3147
        }
3148
        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3149
        zExp = aExp;
3150
        goto roundAndPack;
3151
    }
3152
    aSig |= LIT64( 0x2000000000000000 );
3153
    zSig = ( aSig + bSig )<<1;
3154
    --zExp;
3155
    if ( (int64_t) zSig < 0 ) {
3156
        zSig = aSig + bSig;
3157
        ++zExp;
3158
    }
3159
 roundAndPack:
3160
    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3161

    
3162
}
3163

    
3164
/*----------------------------------------------------------------------------
3165
| Returns the result of subtracting the absolute values of the double-
3166
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
3167
| difference is negated before being returned.  `zSign' is ignored if the
3168
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
3169
| Standard for Binary Floating-Point Arithmetic.
3170
*----------------------------------------------------------------------------*/
3171

    
3172
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3173
{
3174
    int16 aExp, bExp, zExp;
3175
    uint64_t aSig, bSig, zSig;
3176
    int16 expDiff;
3177

    
3178
    aSig = extractFloat64Frac( a );
3179
    aExp = extractFloat64Exp( a );
3180
    bSig = extractFloat64Frac( b );
3181
    bExp = extractFloat64Exp( b );
3182
    expDiff = aExp - bExp;
3183
    aSig <<= 10;
3184
    bSig <<= 10;
3185
    if ( 0 < expDiff ) goto aExpBigger;
3186
    if ( expDiff < 0 ) goto bExpBigger;
3187
    if ( aExp == 0x7FF ) {
3188
        if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3189
        float_raise( float_flag_invalid STATUS_VAR);
3190
        return float64_default_nan;
3191
    }
3192
    if ( aExp == 0 ) {
3193
        aExp = 1;
3194
        bExp = 1;
3195
    }
3196
    if ( bSig < aSig ) goto aBigger;
3197
    if ( aSig < bSig ) goto bBigger;
3198
    return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3199
 bExpBigger:
3200
    if ( bExp == 0x7FF ) {
3201
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3202
        return packFloat64( zSign ^ 1, 0x7FF, 0 );
3203
    }
3204
    if ( aExp == 0 ) {
3205
        ++expDiff;
3206
    }
3207
    else {
3208
        aSig |= LIT64( 0x4000000000000000 );
3209
    }
3210
    shift64RightJamming( aSig, - expDiff, &aSig );
3211
    bSig |= LIT64( 0x4000000000000000 );
3212
 bBigger:
3213
    zSig = bSig - aSig;
3214
    zExp = bExp;
3215
    zSign ^= 1;
3216
    goto normalizeRoundAndPack;
3217
 aExpBigger:
3218
    if ( aExp == 0x7FF ) {
3219
        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3220
        return a;
3221
    }
3222
    if ( bExp == 0 ) {
3223
        --expDiff;
3224
    }
3225
    else {
3226
        bSig |= LIT64( 0x4000000000000000 );
3227
    }
3228
    shift64RightJamming( bSig, expDiff, &bSig );
3229
    aSig |= LIT64( 0x4000000000000000 );
3230
 aBigger:
3231
    zSig = aSig - bSig;
3232
    zExp = aExp;
3233
 normalizeRoundAndPack:
3234
    --zExp;
3235
    return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3236

    
3237
}
3238

    
3239
/*----------------------------------------------------------------------------
3240
| Returns the result of adding the double-precision floating-point values `a'
3241
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
3242
| Binary Floating-Point Arithmetic.
3243
*----------------------------------------------------------------------------*/
3244

    
3245
float64 float64_add( float64 a, float64 b STATUS_PARAM )
3246
{
3247
    flag aSign, bSign;
3248
    a = float64_squash_input_denormal(a STATUS_VAR);
3249
    b = float64_squash_input_denormal(b STATUS_VAR);
3250

    
3251
    aSign = extractFloat64Sign( a );
3252
    bSign = extractFloat64Sign( b );
3253
    if ( aSign == bSign ) {
3254
        return addFloat64Sigs( a, b, aSign STATUS_VAR );
3255
    }
3256
    else {
3257
        return subFloat64Sigs( a, b, aSign STATUS_VAR );
3258
    }
3259

    
3260
}
3261

    
3262
/*----------------------------------------------------------------------------
3263
| Returns the result of subtracting the double-precision floating-point values
3264
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3265
| for Binary Floating-Point Arithmetic.
3266
*----------------------------------------------------------------------------*/
3267

    
3268
float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3269
{
3270
    flag aSign, bSign;
3271
    a = float64_squash_input_denormal(a STATUS_VAR);
3272
    b = float64_squash_input_denormal(b STATUS_VAR);
3273

    
3274
    aSign = extractFloat64Sign( a );
3275
    bSign = extractFloat64Sign( b );
3276
    if ( aSign == bSign ) {
3277
        return subFloat64Sigs( a, b, aSign STATUS_VAR );
3278
    }
3279
    else {
3280
        return addFloat64Sigs( a, b, aSign STATUS_VAR );
3281
    }
3282

    
3283
}
3284

    
3285
/*----------------------------------------------------------------------------
3286
| Returns the result of multiplying the double-precision floating-point values
3287
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3288
| for Binary Floating-Point Arithmetic.
3289
*----------------------------------------------------------------------------*/
3290

    
3291
float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3292
{
3293
    flag aSign, bSign, zSign;
3294
    int16 aExp, bExp, zExp;
3295
    uint64_t aSig, bSig, zSig0, zSig1;
3296

    
3297
    a = float64_squash_input_denormal(a STATUS_VAR);
3298
    b = float64_squash_input_denormal(b STATUS_VAR);
3299

    
3300
    aSig = extractFloat64Frac( a );
3301
    aExp = extractFloat64Exp( a );
3302
    aSign = extractFloat64Sign( a );
3303
    bSig = extractFloat64Frac( b );
3304
    bExp = extractFloat64Exp( b );
3305
    bSign = extractFloat64Sign( b );
3306
    zSign = aSign ^ bSign;
3307
    if ( aExp == 0x7FF ) {
3308
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3309
            return propagateFloat64NaN( a, b STATUS_VAR );
3310
        }
3311
        if ( ( bExp | bSig ) == 0 ) {
3312
            float_raise( float_flag_invalid STATUS_VAR);
3313
            return float64_default_nan;
3314
        }
3315
        return packFloat64( zSign, 0x7FF, 0 );
3316
    }
3317
    if ( bExp == 0x7FF ) {
3318
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3319
        if ( ( aExp | aSig ) == 0 ) {
3320
            float_raise( float_flag_invalid STATUS_VAR);
3321
            return float64_default_nan;
3322
        }
3323
        return packFloat64( zSign, 0x7FF, 0 );
3324
    }
3325
    if ( aExp == 0 ) {
3326
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3327
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3328
    }
3329
    if ( bExp == 0 ) {
3330
        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3331
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3332
    }
3333
    zExp = aExp + bExp - 0x3FF;
3334
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3335
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3336
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
3337
    zSig0 |= ( zSig1 != 0 );
3338
    if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3339
        zSig0 <<= 1;
3340
        --zExp;
3341
    }
3342
    return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3343

    
3344
}
3345

    
3346
/*----------------------------------------------------------------------------
3347
| Returns the result of dividing the double-precision floating-point value `a'
3348
| by the corresponding value `b'.  The operation is performed according to
3349
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3350
*----------------------------------------------------------------------------*/
3351

    
3352
float64 float64_div( float64 a, float64 b STATUS_PARAM )
3353
{
3354
    flag aSign, bSign, zSign;
3355
    int16 aExp, bExp, zExp;
3356
    uint64_t aSig, bSig, zSig;
3357
    uint64_t rem0, rem1;
3358
    uint64_t term0, term1;
3359
    a = float64_squash_input_denormal(a STATUS_VAR);
3360
    b = float64_squash_input_denormal(b STATUS_VAR);
3361

    
3362
    aSig = extractFloat64Frac( a );
3363
    aExp = extractFloat64Exp( a );
3364
    aSign = extractFloat64Sign( a );
3365
    bSig = extractFloat64Frac( b );
3366
    bExp = extractFloat64Exp( b );
3367
    bSign = extractFloat64Sign( b );
3368
    zSign = aSign ^ bSign;
3369
    if ( aExp == 0x7FF ) {
3370
        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3371
        if ( bExp == 0x7FF ) {
3372
            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3373
            float_raise( float_flag_invalid STATUS_VAR);
3374
            return float64_default_nan;
3375
        }
3376
        return packFloat64( zSign, 0x7FF, 0 );
3377
    }
3378
    if ( bExp == 0x7FF ) {
3379
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3380
        return packFloat64( zSign, 0, 0 );
3381
    }
3382
    if ( bExp == 0 ) {
3383
        if ( bSig == 0 ) {
3384
            if ( ( aExp | aSig ) == 0 ) {
3385
                float_raise( float_flag_invalid STATUS_VAR);
3386
                return float64_default_nan;
3387
            }
3388
            float_raise( float_flag_divbyzero STATUS_VAR);
3389
            return packFloat64( zSign, 0x7FF, 0 );
3390
        }
3391
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3392
    }
3393
    if ( aExp == 0 ) {
3394
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3395
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3396
    }
3397
    zExp = aExp - bExp + 0x3FD;
3398
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3399
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3400
    if ( bSig <= ( aSig + aSig ) ) {
3401
        aSig >>= 1;
3402
        ++zExp;
3403
    }
3404
    zSig = estimateDiv128To64( aSig, 0, bSig );
3405
    if ( ( zSig & 0x1FF ) <= 2 ) {
3406
        mul64To128( bSig, zSig, &term0, &term1 );
3407
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3408
        while ( (int64_t) rem0 < 0 ) {
3409
            --zSig;
3410
            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3411
        }
3412
        zSig |= ( rem1 != 0 );
3413
    }
3414
    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3415

    
3416
}
3417

    
3418
/*----------------------------------------------------------------------------
3419
| Returns the remainder of the double-precision floating-point value `a'
3420
| with respect to the corresponding value `b'.  The operation is performed
3421
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3422
*----------------------------------------------------------------------------*/
3423

    
3424
float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3425
{
3426
    flag aSign, zSign;
3427
    int16 aExp, bExp, expDiff;
3428
    uint64_t aSig, bSig;
3429
    uint64_t q, alternateASig;
3430
    int64_t sigMean;
3431

    
3432
    a = float64_squash_input_denormal(a STATUS_VAR);
3433
    b = float64_squash_input_denormal(b STATUS_VAR);
3434
    aSig = extractFloat64Frac( a );
3435
    aExp = extractFloat64Exp( a );
3436
    aSign = extractFloat64Sign( a );
3437
    bSig = extractFloat64Frac( b );
3438
    bExp = extractFloat64Exp( b );
3439
    if ( aExp == 0x7FF ) {
3440
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3441
            return propagateFloat64NaN( a, b STATUS_VAR );
3442
        }
3443
        float_raise( float_flag_invalid STATUS_VAR);
3444
        return float64_default_nan;
3445
    }
3446
    if ( bExp == 0x7FF ) {
3447
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3448
        return a;
3449
    }
3450
    if ( bExp == 0 ) {
3451
        if ( bSig == 0 ) {
3452
            float_raise( float_flag_invalid STATUS_VAR);
3453
            return float64_default_nan;
3454
        }
3455
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3456
    }
3457
    if ( aExp == 0 ) {
3458
        if ( aSig == 0 ) return a;
3459
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3460
    }
3461
    expDiff = aExp - bExp;
3462
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3463
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3464
    if ( expDiff < 0 ) {
3465
        if ( expDiff < -1 ) return a;
3466
        aSig >>= 1;
3467
    }
3468
    q = ( bSig <= aSig );
3469
    if ( q ) aSig -= bSig;
3470
    expDiff -= 64;
3471
    while ( 0 < expDiff ) {
3472
        q = estimateDiv128To64( aSig, 0, bSig );
3473
        q = ( 2 < q ) ? q - 2 : 0;
3474
        aSig = - ( ( bSig>>2 ) * q );
3475
        expDiff -= 62;
3476
    }
3477
    expDiff += 64;
3478
    if ( 0 < expDiff ) {
3479
        q = estimateDiv128To64( aSig, 0, bSig );
3480
        q = ( 2 < q ) ? q - 2 : 0;
3481
        q >>= 64 - expDiff;
3482
        bSig >>= 2;
3483
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3484
    }
3485
    else {
3486
        aSig >>= 2;
3487
        bSig >>= 2;
3488
    }
3489
    do {
3490
        alternateASig = aSig;
3491
        ++q;
3492
        aSig -= bSig;
3493
    } while ( 0 <= (int64_t) aSig );
3494
    sigMean = aSig + alternateASig;
3495
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3496
        aSig = alternateASig;
3497
    }
3498
    zSign = ( (int64_t) aSig < 0 );
3499
    if ( zSign ) aSig = - aSig;
3500
    return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3501

    
3502
}
3503

    
3504
/*----------------------------------------------------------------------------
3505
| Returns the square root of the double-precision floating-point value `a'.
3506
| The operation is performed according to the IEC/IEEE Standard for Binary
3507
| Floating-Point Arithmetic.
3508
*----------------------------------------------------------------------------*/
3509

    
3510
float64 float64_sqrt( float64 a STATUS_PARAM )
3511
{
3512
    flag aSign;
3513
    int16 aExp, zExp;
3514
    uint64_t aSig, zSig, doubleZSig;
3515
    uint64_t rem0, rem1, term0, term1;
3516
    a = float64_squash_input_denormal(a STATUS_VAR);
3517

    
3518
    aSig = extractFloat64Frac( a );
3519
    aExp = extractFloat64Exp( a );
3520
    aSign = extractFloat64Sign( a );
3521
    if ( aExp == 0x7FF ) {
3522
        if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3523
        if ( ! aSign ) return a;
3524
        float_raise( float_flag_invalid STATUS_VAR);
3525
        return float64_default_nan;
3526
    }
3527
    if ( aSign ) {
3528
        if ( ( aExp | aSig ) == 0 ) return a;
3529
        float_raise( float_flag_invalid STATUS_VAR);
3530
        return float64_default_nan;
3531
    }
3532
    if ( aExp == 0 ) {
3533
        if ( aSig == 0 ) return float64_zero;
3534
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3535
    }
3536
    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3537
    aSig |= LIT64( 0x0010000000000000 );
3538
    zSig = estimateSqrt32( aExp, aSig>>21 );
3539
    aSig <<= 9 - ( aExp & 1 );
3540
    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3541
    if ( ( zSig & 0x1FF ) <= 5 ) {
3542
        doubleZSig = zSig<<1;
3543
        mul64To128( zSig, zSig, &term0, &term1 );
3544
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3545
        while ( (int64_t) rem0 < 0 ) {
3546
            --zSig;
3547
            doubleZSig -= 2;
3548
            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3549
        }
3550
        zSig |= ( ( rem0 | rem1 ) != 0 );
3551
    }
3552
    return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3553

    
3554
}
3555

    
3556
/*----------------------------------------------------------------------------
3557
| Returns the binary log of the double-precision floating-point value `a'.
3558
| The operation is performed according to the IEC/IEEE Standard for Binary
3559
| Floating-Point Arithmetic.
3560
*----------------------------------------------------------------------------*/
3561
float64 float64_log2( float64 a STATUS_PARAM )
3562
{
3563
    flag aSign, zSign;
3564
    int16 aExp;
3565
    uint64_t aSig, aSig0, aSig1, zSig, i;
3566
    a = float64_squash_input_denormal(a STATUS_VAR);
3567

    
3568
    aSig = extractFloat64Frac( a );
3569
    aExp = extractFloat64Exp( a );
3570
    aSign = extractFloat64Sign( a );
3571

    
3572
    if ( aExp == 0 ) {
3573
        if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3574
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3575
    }
3576
    if ( aSign ) {
3577
        float_raise( float_flag_invalid STATUS_VAR);
3578
        return float64_default_nan;
3579
    }
3580
    if ( aExp == 0x7FF ) {
3581
        if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3582
        return a;
3583
    }
3584

    
3585
    aExp -= 0x3FF;
3586
    aSig |= LIT64( 0x0010000000000000 );
3587
    zSign = aExp < 0;
3588
    zSig = (uint64_t)aExp << 52;
3589
    for (i = 1LL << 51; i > 0; i >>= 1) {
3590
        mul64To128( aSig, aSig, &aSig0, &aSig1 );
3591
        aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3592
        if ( aSig & LIT64( 0x0020000000000000 ) ) {
3593
            aSig >>= 1;
3594
            zSig |= i;
3595
        }
3596
    }
3597

    
3598
    if ( zSign )
3599
        zSig = -zSig;
3600
    return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3601
}
3602

    
3603
/*----------------------------------------------------------------------------
3604
| Returns 1 if the double-precision floating-point value `a' is equal to the
3605
| corresponding value `b', and 0 otherwise.  The invalid exception is raised
3606
| if either operand is a NaN.  Otherwise, the comparison is performed
3607
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3608
*----------------------------------------------------------------------------*/
3609

    
3610
int float64_eq( float64 a, float64 b STATUS_PARAM )
3611
{
3612
    uint64_t av, bv;
3613
    a = float64_squash_input_denormal(a STATUS_VAR);
3614
    b = float64_squash_input_denormal(b STATUS_VAR);
3615

    
3616
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3617
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3618
       ) {
3619
        float_raise( float_flag_invalid STATUS_VAR);
3620
        return 0;
3621
    }
3622
    av = float64_val(a);
3623
    bv = float64_val(b);
3624
    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3625

    
3626
}
3627

    
3628
/*----------------------------------------------------------------------------
3629
| Returns 1 if the double-precision floating-point value `a' is less than or
3630
| equal to the corresponding value `b', and 0 otherwise.  The invalid
3631
| exception is raised if either operand is a NaN.  The comparison is performed
3632
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3633
*----------------------------------------------------------------------------*/
3634

    
3635
int float64_le( float64 a, float64 b STATUS_PARAM )
3636
{
3637
    flag aSign, bSign;
3638
    uint64_t av, bv;
3639
    a = float64_squash_input_denormal(a STATUS_VAR);
3640
    b = float64_squash_input_denormal(b STATUS_VAR);
3641

    
3642
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3643
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3644
       ) {
3645
        float_raise( float_flag_invalid STATUS_VAR);
3646
        return 0;
3647
    }
3648
    aSign = extractFloat64Sign( a );
3649
    bSign = extractFloat64Sign( b );
3650
    av = float64_val(a);
3651
    bv = float64_val(b);
3652
    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3653
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3654

    
3655
}
3656

    
3657
/*----------------------------------------------------------------------------
3658
| Returns 1 if the double-precision floating-point value `a' is less than
3659
| the corresponding value `b', and 0 otherwise.  The invalid exception is
3660
| raised if either operand is a NaN.  The comparison is performed according
3661
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3662
*----------------------------------------------------------------------------*/
3663

    
3664
int float64_lt( float64 a, float64 b STATUS_PARAM )
3665
{
3666
    flag aSign, bSign;
3667
    uint64_t av, bv;
3668

    
3669
    a = float64_squash_input_denormal(a STATUS_VAR);
3670
    b = float64_squash_input_denormal(b STATUS_VAR);
3671
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3672
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3673
       ) {
3674
        float_raise( float_flag_invalid STATUS_VAR);
3675
        return 0;
3676
    }
3677
    aSign = extractFloat64Sign( a );
3678
    bSign = extractFloat64Sign( b );
3679
    av = float64_val(a);
3680
    bv = float64_val(b);
3681
    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3682
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3683

    
3684
}
3685

    
3686
/*----------------------------------------------------------------------------
3687
| Returns 1 if the double-precision floating-point values `a' and `b' cannot
3688
| be compared, and 0 otherwise.  The invalid exception is raised if either
3689
| operand is a NaN.  The comparison is performed according to the IEC/IEEE
3690
| Standard for Binary Floating-Point Arithmetic.
3691
*----------------------------------------------------------------------------*/
3692

    
3693
int float64_unordered( float64 a, float64 b STATUS_PARAM )
3694
{
3695
    a = float64_squash_input_denormal(a STATUS_VAR);
3696
    b = float64_squash_input_denormal(b STATUS_VAR);
3697

    
3698
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3699
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3700
       ) {
3701
        float_raise( float_flag_invalid STATUS_VAR);
3702
        return 1;
3703
    }
3704
    return 0;
3705
}
3706

    
3707
/*----------------------------------------------------------------------------
3708
| Returns 1 if the double-precision floating-point value `a' is equal to the
3709
| corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3710
| exception.The comparison is performed according to the IEC/IEEE Standard
3711
| for Binary Floating-Point Arithmetic.
3712
*----------------------------------------------------------------------------*/
3713

    
3714
int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
3715
{
3716
    uint64_t av, bv;
3717
    a = float64_squash_input_denormal(a STATUS_VAR);
3718
    b = float64_squash_input_denormal(b STATUS_VAR);
3719

    
3720
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3721
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3722
       ) {
3723
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3724
            float_raise( float_flag_invalid STATUS_VAR);
3725
        }
3726
        return 0;
3727
    }
3728
    av = float64_val(a);
3729
    bv = float64_val(b);
3730
    return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3731

    
3732
}
3733

    
3734
/*----------------------------------------------------------------------------
3735
| Returns 1 if the double-precision floating-point value `a' is less than or
3736
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3737
| cause an exception.  Otherwise, the comparison is performed according to the
3738
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3739
*----------------------------------------------------------------------------*/
3740

    
3741
int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3742
{
3743
    flag aSign, bSign;
3744
    uint64_t av, bv;
3745
    a = float64_squash_input_denormal(a STATUS_VAR);
3746
    b = float64_squash_input_denormal(b STATUS_VAR);
3747

    
3748
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3749
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3750
       ) {
3751
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3752
            float_raise( float_flag_invalid STATUS_VAR);
3753
        }
3754
        return 0;
3755
    }
3756
    aSign = extractFloat64Sign( a );
3757
    bSign = extractFloat64Sign( b );
3758
    av = float64_val(a);
3759
    bv = float64_val(b);
3760
    if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3761
    return ( av == bv ) || ( aSign ^ ( av < bv ) );
3762

    
3763
}
3764

    
3765
/*----------------------------------------------------------------------------
3766
| Returns 1 if the double-precision floating-point value `a' is less than
3767
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3768
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3769
| Standard for Binary Floating-Point Arithmetic.
3770
*----------------------------------------------------------------------------*/
3771

    
3772
int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3773
{
3774
    flag aSign, bSign;
3775
    uint64_t av, bv;
3776
    a = float64_squash_input_denormal(a STATUS_VAR);
3777
    b = float64_squash_input_denormal(b STATUS_VAR);
3778

    
3779
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3780
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3781
       ) {
3782
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3783
            float_raise( float_flag_invalid STATUS_VAR);
3784
        }
3785
        return 0;
3786
    }
3787
    aSign = extractFloat64Sign( a );
3788
    bSign = extractFloat64Sign( b );
3789
    av = float64_val(a);
3790
    bv = float64_val(b);
3791
    if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3792
    return ( av != bv ) && ( aSign ^ ( av < bv ) );
3793

    
3794
}
3795

    
3796
/*----------------------------------------------------------------------------
3797
| Returns 1 if the double-precision floating-point values `a' and `b' cannot
3798
| be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3799
| comparison is performed according to the IEC/IEEE Standard for Binary
3800
| Floating-Point Arithmetic.
3801
*----------------------------------------------------------------------------*/
3802

    
3803
int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
3804
{
3805
    a = float64_squash_input_denormal(a STATUS_VAR);
3806
    b = float64_squash_input_denormal(b STATUS_VAR);
3807

    
3808
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3809
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3810
       ) {
3811
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3812
            float_raise( float_flag_invalid STATUS_VAR);
3813
        }
3814
        return 1;
3815
    }
3816
    return 0;
3817
}
3818

    
3819
#ifdef FLOATX80
3820

    
3821
/*----------------------------------------------------------------------------
3822
| Returns the result of converting the extended double-precision floating-
3823
| point value `a' to the 32-bit two's complement integer format.  The
3824
| conversion is performed according to the IEC/IEEE Standard for Binary
3825
| Floating-Point Arithmetic---which means in particular that the conversion
3826
| is rounded according to the current rounding mode.  If `a' is a NaN, the
3827
| largest positive integer is returned.  Otherwise, if the conversion
3828
| overflows, the largest integer with the same sign as `a' is returned.
3829
*----------------------------------------------------------------------------*/
3830

    
3831
int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3832
{
3833
    flag aSign;
3834
    int32 aExp, shiftCount;
3835
    uint64_t aSig;
3836

    
3837
    aSig = extractFloatx80Frac( a );
3838
    aExp = extractFloatx80Exp( a );
3839
    aSign = extractFloatx80Sign( a );
3840
    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3841
    shiftCount = 0x4037 - aExp;
3842
    if ( shiftCount <= 0 ) shiftCount = 1;
3843
    shift64RightJamming( aSig, shiftCount, &aSig );
3844
    return roundAndPackInt32( aSign, aSig STATUS_VAR );
3845

    
3846
}
3847

    
3848
/*----------------------------------------------------------------------------
3849
| Returns the result of converting the extended double-precision floating-
3850
| point value `a' to the 32-bit two's complement integer format.  The
3851
| conversion is performed according to the IEC/IEEE Standard for Binary
3852
| Floating-Point Arithmetic, except that the conversion is always rounded
3853
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3854
| Otherwise, if the conversion overflows, the largest integer with the same
3855
| sign as `a' is returned.
3856
*----------------------------------------------------------------------------*/
3857

    
3858
int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3859
{
3860
    flag aSign;
3861
    int32 aExp, shiftCount;
3862
    uint64_t aSig, savedASig;
3863
    int32 z;
3864

    
3865
    aSig = extractFloatx80Frac( a );
3866
    aExp = extractFloatx80Exp( a );
3867
    aSign = extractFloatx80Sign( a );
3868
    if ( 0x401E < aExp ) {
3869
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3870
        goto invalid;
3871
    }
3872
    else if ( aExp < 0x3FFF ) {
3873
        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3874
        return 0;
3875
    }
3876
    shiftCount = 0x403E - aExp;
3877
    savedASig = aSig;
3878
    aSig >>= shiftCount;
3879
    z = aSig;
3880
    if ( aSign ) z = - z;
3881
    if ( ( z < 0 ) ^ aSign ) {
3882
 invalid:
3883
        float_raise( float_flag_invalid STATUS_VAR);
3884
        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3885
    }
3886
    if ( ( aSig<<shiftCount ) != savedASig ) {
3887
        STATUS(float_exception_flags) |= float_flag_inexact;
3888
    }
3889
    return z;
3890

    
3891
}
3892

    
3893
/*----------------------------------------------------------------------------
3894
| Returns the result of converting the extended double-precision floating-
3895
| point value `a' to the 64-bit two's complement integer format.  The
3896
| conversion is performed according to the IEC/IEEE Standard for Binary
3897
| Floating-Point Arithmetic---which means in particular that the conversion
3898
| is rounded according to the current rounding mode.  If `a' is a NaN,
3899
| the largest positive integer is returned.  Otherwise, if the conversion
3900
| overflows, the largest integer with the same sign as `a' is returned.
3901
*----------------------------------------------------------------------------*/
3902

    
3903
int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3904
{
3905
    flag aSign;
3906
    int32 aExp, shiftCount;
3907
    uint64_t aSig, aSigExtra;
3908

    
3909
    aSig = extractFloatx80Frac( a );
3910
    aExp = extractFloatx80Exp( a );
3911
    aSign = extractFloatx80Sign( a );
3912
    shiftCount = 0x403E - aExp;
3913
    if ( shiftCount <= 0 ) {
3914
        if ( shiftCount ) {
3915
            float_raise( float_flag_invalid STATUS_VAR);
3916
            if (    ! aSign
3917
                 || (    ( aExp == 0x7FFF )
3918
                      && ( aSig != LIT64( 0x8000000000000000 ) ) )
3919
               ) {
3920
                return LIT64( 0x7FFFFFFFFFFFFFFF );
3921
            }
3922
            return (int64_t) LIT64( 0x8000000000000000 );
3923
        }
3924
        aSigExtra = 0;
3925
    }
3926
    else {
3927
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3928
    }
3929
    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3930

    
3931
}
3932

    
3933
/*----------------------------------------------------------------------------
3934
| Returns the result of converting the extended double-precision floating-
3935
| point value `a' to the 64-bit two's complement integer format.  The
3936
| conversion is performed according to the IEC/IEEE Standard for Binary
3937
| Floating-Point Arithmetic, except that the conversion is always rounded
3938
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3939
| Otherwise, if the conversion overflows, the largest integer with the same
3940
| sign as `a' is returned.
3941
*----------------------------------------------------------------------------*/
3942

    
3943
int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3944
{
3945
    flag aSign;
3946
    int32 aExp, shiftCount;
3947
    uint64_t aSig;
3948
    int64 z;
3949

    
3950
    aSig = extractFloatx80Frac( a );
3951
    aExp = extractFloatx80Exp( a );
3952
    aSign = extractFloatx80Sign( a );
3953
    shiftCount = aExp - 0x403E;
3954
    if ( 0 <= shiftCount ) {
3955
        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3956
        if ( ( a.high != 0xC03E ) || aSig ) {
3957
            float_raise( float_flag_invalid STATUS_VAR);
3958
            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3959
                return LIT64( 0x7FFFFFFFFFFFFFFF );
3960
            }
3961
        }
3962
        return (int64_t) LIT64( 0x8000000000000000 );
3963
    }
3964
    else if ( aExp < 0x3FFF ) {
3965
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3966
        return 0;
3967
    }
3968
    z = aSig>>( - shiftCount );
3969
    if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3970
        STATUS(float_exception_flags) |= float_flag_inexact;
3971
    }
3972
    if ( aSign ) z = - z;
3973
    return z;
3974

    
3975
}
3976

    
3977
/*----------------------------------------------------------------------------
3978
| Returns the result of converting the extended double-precision floating-
3979
| point value `a' to the single-precision floating-point format.  The
3980
| conversion is performed according to the IEC/IEEE Standard for Binary
3981
| Floating-Point Arithmetic.
3982
*----------------------------------------------------------------------------*/
3983

    
3984
float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3985
{
3986
    flag aSign;
3987
    int32 aExp;
3988
    uint64_t aSig;
3989

    
3990
    aSig = extractFloatx80Frac( a );
3991
    aExp = extractFloatx80Exp( a );
3992
    aSign = extractFloatx80Sign( a );
3993
    if ( aExp == 0x7FFF ) {
3994
        if ( (uint64_t) ( aSig<<1 ) ) {
3995
            return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3996
        }
3997
        return packFloat32( aSign, 0xFF, 0 );
3998
    }
3999
    shift64RightJamming( aSig, 33, &aSig );
4000
    if ( aExp || aSig ) aExp -= 0x3F81;
4001
    return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
4002

    
4003
}
4004

    
4005
/*----------------------------------------------------------------------------
4006
| Returns the result of converting the extended double-precision floating-
4007
| point value `a' to the double-precision floating-point format.  The
4008
| conversion is performed according to the IEC/IEEE Standard for Binary
4009
| Floating-Point Arithmetic.
4010
*----------------------------------------------------------------------------*/
4011

    
4012
float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
4013
{
4014
    flag aSign;
4015
    int32 aExp;
4016
    uint64_t aSig, zSig;
4017

    
4018
    aSig = extractFloatx80Frac( a );
4019
    aExp = extractFloatx80Exp( a );
4020
    aSign = extractFloatx80Sign( a );
4021
    if ( aExp == 0x7FFF ) {
4022
        if ( (uint64_t) ( aSig<<1 ) ) {
4023
            return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4024
        }
4025
        return packFloat64( aSign, 0x7FF, 0 );
4026
    }
4027
    shift64RightJamming( aSig, 1, &zSig );
4028
    if ( aExp || aSig ) aExp -= 0x3C01;
4029
    return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4030

    
4031
}
4032

    
4033
#ifdef FLOAT128
4034

    
4035
/*----------------------------------------------------------------------------
4036
| Returns the result of converting the extended double-precision floating-
4037
| point value `a' to the quadruple-precision floating-point format.  The
4038
| conversion is performed according to the IEC/IEEE Standard for Binary
4039
| Floating-Point Arithmetic.
4040
*----------------------------------------------------------------------------*/
4041

    
4042
float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4043
{
4044
    flag aSign;
4045
    int16 aExp;
4046
    uint64_t aSig, zSig0, zSig1;
4047

    
4048
    aSig = extractFloatx80Frac( a );
4049
    aExp = extractFloatx80Exp( a );
4050
    aSign = extractFloatx80Sign( a );
4051
    if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4052
        return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4053
    }
4054
    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4055
    return packFloat128( aSign, aExp, zSig0, zSig1 );
4056

    
4057
}
4058

    
4059
#endif
4060

    
4061
/*----------------------------------------------------------------------------
4062
| Rounds the extended double-precision floating-point value `a' to an integer,
4063
| and returns the result as an extended quadruple-precision floating-point
4064
| value.  The operation is performed according to the IEC/IEEE Standard for
4065
| Binary Floating-Point Arithmetic.
4066
*----------------------------------------------------------------------------*/
4067

    
4068
floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4069
{
4070
    flag aSign;
4071
    int32 aExp;
4072
    uint64_t lastBitMask, roundBitsMask;
4073
    int8 roundingMode;
4074
    floatx80 z;
4075

    
4076
    aExp = extractFloatx80Exp( a );
4077
    if ( 0x403E <= aExp ) {
4078
        if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4079
            return propagateFloatx80NaN( a, a STATUS_VAR );
4080
        }
4081
        return a;
4082
    }
4083
    if ( aExp < 0x3FFF ) {
4084
        if (    ( aExp == 0 )
4085
             && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4086
            return a;
4087
        }
4088
        STATUS(float_exception_flags) |= float_flag_inexact;
4089
        aSign = extractFloatx80Sign( a );
4090
        switch ( STATUS(float_rounding_mode) ) {
4091
         case float_round_nearest_even:
4092
            if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4093
               ) {
4094
                return
4095
                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4096
            }
4097
            break;
4098
         case float_round_down:
4099
            return
4100
                  aSign ?
4101
                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4102
                : packFloatx80( 0, 0, 0 );
4103
         case float_round_up:
4104
            return
4105
                  aSign ? packFloatx80( 1, 0, 0 )
4106
                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4107
        }
4108
        return packFloatx80( aSign, 0, 0 );
4109
    }
4110
    lastBitMask = 1;
4111
    lastBitMask <<= 0x403E - aExp;
4112
    roundBitsMask = lastBitMask - 1;
4113
    z = a;
4114
    roundingMode = STATUS(float_rounding_mode);
4115
    if ( roundingMode == float_round_nearest_even ) {
4116
        z.low += lastBitMask>>1;
4117
        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4118
    }
4119
    else if ( roundingMode != float_round_to_zero ) {
4120
        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4121
            z.low += roundBitsMask;
4122
        }
4123
    }
4124
    z.low &= ~ roundBitsMask;
4125
    if ( z.low == 0 ) {
4126
        ++z.high;
4127
        z.low = LIT64( 0x8000000000000000 );
4128
    }
4129
    if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4130
    return z;
4131

    
4132
}
4133

    
4134
/*----------------------------------------------------------------------------
4135
| Returns the result of adding the absolute values of the extended double-
4136
| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4137
| negated before being returned.  `zSign' is ignored if the result is a NaN.
4138
| The addition is performed according to the IEC/IEEE Standard for Binary
4139
| Floating-Point Arithmetic.
4140
*----------------------------------------------------------------------------*/
4141

    
4142
static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4143
{
4144
    int32 aExp, bExp, zExp;
4145
    uint64_t aSig, bSig, zSig0, zSig1;
4146
    int32 expDiff;
4147

    
4148
    aSig = extractFloatx80Frac( a );
4149
    aExp = extractFloatx80Exp( a );
4150
    bSig = extractFloatx80Frac( b );
4151
    bExp = extractFloatx80Exp( b );
4152
    expDiff = aExp - bExp;
4153
    if ( 0 < expDiff ) {
4154
        if ( aExp == 0x7FFF ) {
4155
            if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4156
            return a;
4157
        }
4158
        if ( bExp == 0 ) --expDiff;
4159
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4160
        zExp = aExp;
4161
    }
4162
    else if ( expDiff < 0 ) {
4163
        if ( bExp == 0x7FFF ) {
4164
            if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4165
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4166
        }
4167
        if ( aExp == 0 ) ++expDiff;
4168
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4169
        zExp = bExp;
4170
    }
4171
    else {
4172
        if ( aExp == 0x7FFF ) {
4173
            if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4174
                return propagateFloatx80NaN( a, b STATUS_VAR );
4175
            }
4176
            return a;
4177
        }
4178
        zSig1 = 0;
4179
        zSig0 = aSig + bSig;
4180
        if ( aExp == 0 ) {
4181
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4182
            goto roundAndPack;
4183
        }
4184
        zExp = aExp;
4185
        goto shiftRight1;
4186
    }
4187
    zSig0 = aSig + bSig;
4188
    if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4189
 shiftRight1:
4190
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4191
    zSig0 |= LIT64( 0x8000000000000000 );
4192
    ++zExp;
4193
 roundAndPack:
4194
    return
4195
        roundAndPackFloatx80(
4196
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4197

    
4198
}
4199

    
4200
/*----------------------------------------------------------------------------
4201
| Returns the result of subtracting the absolute values of the extended
4202
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4203
| difference is negated before being returned.  `zSign' is ignored if the
4204
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
4205
| Standard for Binary Floating-Point Arithmetic.
4206
*----------------------------------------------------------------------------*/
4207

    
4208
static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4209
{
4210
    int32 aExp, bExp, zExp;
4211
    uint64_t aSig, bSig, zSig0, zSig1;
4212
    int32 expDiff;
4213
    floatx80 z;
4214

    
4215
    aSig = extractFloatx80Frac( a );
4216
    aExp = extractFloatx80Exp( a );
4217
    bSig = extractFloatx80Frac( b );
4218
    bExp = extractFloatx80Exp( b );
4219
    expDiff = aExp - bExp;
4220
    if ( 0 < expDiff ) goto aExpBigger;
4221
    if ( expDiff < 0 ) goto bExpBigger;
4222
    if ( aExp == 0x7FFF ) {
4223
        if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4224
            return propagateFloatx80NaN( a, b STATUS_VAR );
4225
        }
4226
        float_raise( float_flag_invalid STATUS_VAR);
4227
        z.low = floatx80_default_nan_low;
4228
        z.high = floatx80_default_nan_high;
4229
        return z;
4230
    }
4231
    if ( aExp == 0 ) {
4232
        aExp = 1;
4233
        bExp = 1;
4234
    }
4235
    zSig1 = 0;
4236
    if ( bSig < aSig ) goto aBigger;
4237
    if ( aSig < bSig ) goto bBigger;
4238
    return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4239
 bExpBigger:
4240
    if ( bExp == 0x7FFF ) {
4241
        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4242
        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4243
    }
4244
    if ( aExp == 0 ) ++expDiff;
4245
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4246
 bBigger:
4247
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4248
    zExp = bExp;
4249
    zSign ^= 1;
4250
    goto normalizeRoundAndPack;
4251
 aExpBigger:
4252
    if ( aExp == 0x7FFF ) {
4253
        if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4254
        return a;
4255
    }
4256
    if ( bExp == 0 ) --expDiff;
4257
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4258
 aBigger:
4259
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4260
    zExp = aExp;
4261
 normalizeRoundAndPack:
4262
    return
4263
        normalizeRoundAndPackFloatx80(
4264
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4265

    
4266
}
4267

    
4268
/*----------------------------------------------------------------------------
4269
| Returns the result of adding the extended double-precision floating-point
4270
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4271
| Standard for Binary Floating-Point Arithmetic.
4272
*----------------------------------------------------------------------------*/
4273

    
4274
floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4275
{
4276
    flag aSign, bSign;
4277

    
4278
    aSign = extractFloatx80Sign( a );
4279
    bSign = extractFloatx80Sign( b );
4280
    if ( aSign == bSign ) {
4281
        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4282
    }
4283
    else {
4284
        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4285
    }
4286

    
4287
}
4288

    
4289
/*----------------------------------------------------------------------------
4290
| Returns the result of subtracting the extended double-precision floating-
4291
| point values `a' and `b'.  The operation is performed according to the
4292
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4293
*----------------------------------------------------------------------------*/
4294

    
4295
floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4296
{
4297
    flag aSign, bSign;
4298

    
4299
    aSign = extractFloatx80Sign( a );
4300
    bSign = extractFloatx80Sign( b );
4301
    if ( aSign == bSign ) {
4302
        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4303
    }
4304
    else {
4305
        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4306
    }
4307

    
4308
}
4309

    
4310
/*----------------------------------------------------------------------------
4311
| Returns the result of multiplying the extended double-precision floating-
4312
| point values `a' and `b'.  The operation is performed according to the
4313
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4314
*----------------------------------------------------------------------------*/
4315

    
4316
floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4317
{
4318
    flag aSign, bSign, zSign;
4319
    int32 aExp, bExp, zExp;
4320
    uint64_t aSig, bSig, zSig0, zSig1;
4321
    floatx80 z;
4322

    
4323
    aSig = extractFloatx80Frac( a );
4324
    aExp = extractFloatx80Exp( a );
4325
    aSign = extractFloatx80Sign( a );
4326
    bSig = extractFloatx80Frac( b );
4327
    bExp = extractFloatx80Exp( b );
4328
    bSign = extractFloatx80Sign( b );
4329
    zSign = aSign ^ bSign;
4330
    if ( aExp == 0x7FFF ) {
4331
        if (    (uint64_t) ( aSig<<1 )
4332
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4333
            return propagateFloatx80NaN( a, b STATUS_VAR );
4334
        }
4335
        if ( ( bExp | bSig ) == 0 ) goto invalid;
4336
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4337
    }
4338
    if ( bExp == 0x7FFF ) {
4339
        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4340
        if ( ( aExp | aSig ) == 0 ) {
4341
 invalid:
4342
            float_raise( float_flag_invalid STATUS_VAR);
4343
            z.low = floatx80_default_nan_low;
4344
            z.high = floatx80_default_nan_high;
4345
            return z;
4346
        }
4347
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4348
    }
4349
    if ( aExp == 0 ) {
4350
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4351
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4352
    }
4353
    if ( bExp == 0 ) {
4354
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4355
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4356
    }
4357
    zExp = aExp + bExp - 0x3FFE;
4358
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
4359
    if ( 0 < (int64_t) zSig0 ) {
4360
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4361
        --zExp;
4362
    }
4363
    return
4364
        roundAndPackFloatx80(
4365
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4366

    
4367
}
4368

    
4369
/*----------------------------------------------------------------------------
4370
| Returns the result of dividing the extended double-precision floating-point
4371
| value `a' by the corresponding value `b'.  The operation is performed
4372
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4373
*----------------------------------------------------------------------------*/
4374

    
4375
floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4376
{
4377
    flag aSign, bSign, zSign;
4378
    int32 aExp, bExp, zExp;
4379
    uint64_t aSig, bSig, zSig0, zSig1;
4380
    uint64_t rem0, rem1, rem2, term0, term1, term2;
4381
    floatx80 z;
4382

    
4383
    aSig = extractFloatx80Frac( a );
4384
    aExp = extractFloatx80Exp( a );
4385
    aSign = extractFloatx80Sign( a );
4386
    bSig = extractFloatx80Frac( b );
4387
    bExp = extractFloatx80Exp( b );
4388
    bSign = extractFloatx80Sign( b );
4389
    zSign = aSign ^ bSign;
4390
    if ( aExp == 0x7FFF ) {
4391
        if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4392
        if ( bExp == 0x7FFF ) {
4393
            if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4394
            goto invalid;
4395
        }
4396
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4397
    }
4398
    if ( bExp == 0x7FFF ) {
4399
        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4400
        return packFloatx80( zSign, 0, 0 );
4401
    }
4402
    if ( bExp == 0 ) {
4403
        if ( bSig == 0 ) {
4404
            if ( ( aExp | aSig ) == 0 ) {
4405
 invalid:
4406
                float_raise( float_flag_invalid STATUS_VAR);
4407
                z.low = floatx80_default_nan_low;
4408
                z.high = floatx80_default_nan_high;
4409
                return z;
4410
            }
4411
            float_raise( float_flag_divbyzero STATUS_VAR);
4412
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4413
        }
4414
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4415
    }
4416
    if ( aExp == 0 ) {
4417
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4418
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4419
    }
4420
    zExp = aExp - bExp + 0x3FFE;
4421
    rem1 = 0;
4422
    if ( bSig <= aSig ) {
4423
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
4424
        ++zExp;
4425
    }
4426
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4427
    mul64To128( bSig, zSig0, &term0, &term1 );
4428
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4429
    while ( (int64_t) rem0 < 0 ) {
4430
        --zSig0;
4431
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4432
    }
4433
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
4434
    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4435
        mul64To128( bSig, zSig1, &term1, &term2 );
4436
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4437
        while ( (int64_t) rem1 < 0 ) {
4438
            --zSig1;
4439
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4440
        }
4441
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
4442
    }
4443
    return
4444
        roundAndPackFloatx80(
4445
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4446

    
4447
}
4448

    
4449
/*----------------------------------------------------------------------------
4450
| Returns the remainder of the extended double-precision floating-point value
4451
| `a' with respect to the corresponding value `b'.  The operation is performed
4452
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4453
*----------------------------------------------------------------------------*/
4454

    
4455
floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4456
{
4457
    flag aSign, zSign;
4458
    int32 aExp, bExp, expDiff;
4459
    uint64_t aSig0, aSig1, bSig;
4460
    uint64_t q, term0, term1, alternateASig0, alternateASig1;
4461
    floatx80 z;
4462

    
4463
    aSig0 = extractFloatx80Frac( a );
4464
    aExp = extractFloatx80Exp( a );
4465
    aSign = extractFloatx80Sign( a );
4466
    bSig = extractFloatx80Frac( b );
4467
    bExp = extractFloatx80Exp( b );
4468
    if ( aExp == 0x7FFF ) {
4469
        if (    (uint64_t) ( aSig0<<1 )
4470
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4471
            return propagateFloatx80NaN( a, b STATUS_VAR );
4472
        }
4473
        goto invalid;
4474
    }
4475
    if ( bExp == 0x7FFF ) {
4476
        if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4477
        return a;
4478
    }
4479
    if ( bExp == 0 ) {
4480
        if ( bSig == 0 ) {
4481
 invalid:
4482
            float_raise( float_flag_invalid STATUS_VAR);
4483
            z.low = floatx80_default_nan_low;
4484
            z.high = floatx80_default_nan_high;
4485
            return z;
4486
        }
4487
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4488
    }
4489
    if ( aExp == 0 ) {
4490
        if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
4491
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4492
    }
4493
    bSig |= LIT64( 0x8000000000000000 );
4494
    zSign = aSign;
4495
    expDiff = aExp - bExp;
4496
    aSig1 = 0;
4497
    if ( expDiff < 0 ) {
4498
        if ( expDiff < -1 ) return a;
4499
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4500
        expDiff = 0;
4501
    }
4502
    q = ( bSig <= aSig0 );
4503
    if ( q ) aSig0 -= bSig;
4504
    expDiff -= 64;
4505
    while ( 0 < expDiff ) {
4506
        q = estimateDiv128To64( aSig0, aSig1, bSig );
4507
        q = ( 2 < q ) ? q - 2 : 0;
4508
        mul64To128( bSig, q, &term0, &term1 );
4509
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4510
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4511
        expDiff -= 62;
4512
    }
4513
    expDiff += 64;
4514
    if ( 0 < expDiff ) {
4515
        q = estimateDiv128To64( aSig0, aSig1, bSig );
4516
        q = ( 2 < q ) ? q - 2 : 0;
4517
        q >>= 64 - expDiff;
4518
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4519
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4520
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4521
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
4522
            ++q;
4523
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4524
        }
4525
    }
4526
    else {
4527
        term1 = 0;
4528
        term0 = bSig;
4529
    }
4530
    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4531
    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4532
         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4533
              && ( q & 1 ) )
4534
       ) {
4535
        aSig0 = alternateASig0;
4536
        aSig1 = alternateASig1;
4537
        zSign = ! zSign;
4538
    }
4539
    return
4540
        normalizeRoundAndPackFloatx80(
4541
            80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4542

    
4543
}
4544

    
4545
/*----------------------------------------------------------------------------
4546
| Returns the square root of the extended double-precision floating-point
4547
| value `a'.  The operation is performed according to the IEC/IEEE Standard
4548
| for Binary Floating-Point Arithmetic.
4549
*----------------------------------------------------------------------------*/
4550

    
4551
floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4552
{
4553
    flag aSign;
4554
    int32 aExp, zExp;
4555
    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4556
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4557
    floatx80 z;
4558

    
4559
    aSig0 = extractFloatx80Frac( a );
4560
    aExp = extractFloatx80Exp( a );
4561
    aSign = extractFloatx80Sign( a );
4562
    if ( aExp == 0x7FFF ) {
4563
        if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4564
        if ( ! aSign ) return a;
4565
        goto invalid;
4566
    }
4567
    if ( aSign ) {
4568
        if ( ( aExp | aSig0 ) == 0 ) return a;
4569
 invalid:
4570
        float_raise( float_flag_invalid STATUS_VAR);
4571
        z.low = floatx80_default_nan_low;
4572
        z.high = floatx80_default_nan_high;
4573
        return z;
4574
    }
4575
    if ( aExp == 0 ) {
4576
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4577
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4578
    }
4579
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4580
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4581
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4582
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4583
    doubleZSig0 = zSig0<<1;
4584
    mul64To128( zSig0, zSig0, &term0, &term1 );
4585
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4586
    while ( (int64_t) rem0 < 0 ) {
4587
        --zSig0;
4588
        doubleZSig0 -= 2;
4589
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4590
    }
4591
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4592
    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4593
        if ( zSig1 == 0 ) zSig1 = 1;
4594
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4595
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4596
        mul64To128( zSig1, zSig1, &term2, &term3 );
4597
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4598
        while ( (int64_t) rem1 < 0 ) {
4599
            --zSig1;
4600
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4601
            term3 |= 1;
4602
            term2 |= doubleZSig0;
4603
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4604
        }
4605
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4606
    }
4607
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4608
    zSig0 |= doubleZSig0;
4609
    return
4610
        roundAndPackFloatx80(
4611
            STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4612

    
4613
}
4614

    
4615
/*----------------------------------------------------------------------------
4616
| Returns 1 if the extended double-precision floating-point value `a' is equal
4617
| to the corresponding value `b', and 0 otherwise.  The invalid exception is
4618
| raised if either operand is a NaN.  Otherwise, the comparison is performed
4619
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4620
*----------------------------------------------------------------------------*/
4621

    
4622
int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4623
{
4624

    
4625
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4626
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4627
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4628
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4629
       ) {
4630
        float_raise( float_flag_invalid STATUS_VAR);
4631
        return 0;
4632
    }
4633
    return
4634
           ( a.low == b.low )
4635
        && (    ( a.high == b.high )
4636
             || (    ( a.low == 0 )
4637
                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4638
           );
4639

    
4640
}
4641

    
4642
/*----------------------------------------------------------------------------
4643
| Returns 1 if the extended double-precision floating-point value `a' is
4644
| less than or equal to the corresponding value `b', and 0 otherwise.  The
4645
| invalid exception is raised if either operand is a NaN.  The comparison is
4646
| performed according to the IEC/IEEE Standard for Binary Floating-Point
4647
| Arithmetic.
4648
*----------------------------------------------------------------------------*/
4649

    
4650
int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4651
{
4652
    flag aSign, bSign;
4653

    
4654
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4655
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4656
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4657
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4658
       ) {
4659
        float_raise( float_flag_invalid STATUS_VAR);
4660
        return 0;
4661
    }
4662
    aSign = extractFloatx80Sign( a );
4663
    bSign = extractFloatx80Sign( b );
4664
    if ( aSign != bSign ) {
4665
        return
4666
               aSign
4667
            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4668
                 == 0 );
4669
    }
4670
    return
4671
          aSign ? le128( b.high, b.low, a.high, a.low )
4672
        : le128( a.high, a.low, b.high, b.low );
4673

    
4674
}
4675

    
4676
/*----------------------------------------------------------------------------
4677
| Returns 1 if the extended double-precision floating-point value `a' is
4678
| less than the corresponding value `b', and 0 otherwise.  The invalid
4679
| exception is raised if either operand is a NaN.  The comparison is performed
4680
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4681
*----------------------------------------------------------------------------*/
4682

    
4683
int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4684
{
4685
    flag aSign, bSign;
4686

    
4687
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4688
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4689
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4690
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4691
       ) {
4692
        float_raise( float_flag_invalid STATUS_VAR);
4693
        return 0;
4694
    }
4695
    aSign = extractFloatx80Sign( a );
4696
    bSign = extractFloatx80Sign( b );
4697
    if ( aSign != bSign ) {
4698
        return
4699
               aSign
4700
            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4701
                 != 0 );
4702
    }
4703
    return
4704
          aSign ? lt128( b.high, b.low, a.high, a.low )
4705
        : lt128( a.high, a.low, b.high, b.low );
4706

    
4707
}
4708

    
4709
/*----------------------------------------------------------------------------
4710
| Returns 1 if the extended double-precision floating-point values `a' and `b'
4711
| cannot be compared, and 0 otherwise.  The invalid exception is raised if
4712
| either operand is a NaN.   The comparison is performed according to the
4713
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4714
*----------------------------------------------------------------------------*/
4715
int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
4716
{
4717
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4718
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4719
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4720
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4721
       ) {
4722
        float_raise( float_flag_invalid STATUS_VAR);
4723
        return 1;
4724
    }
4725
    return 0;
4726
}
4727

    
4728
/*----------------------------------------------------------------------------
4729
| Returns 1 if the extended double-precision floating-point value `a' is
4730
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4731
| cause an exception.  The comparison is performed according to the IEC/IEEE
4732
| Standard for Binary Floating-Point Arithmetic.
4733
*----------------------------------------------------------------------------*/
4734

    
4735
int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4736
{
4737

    
4738
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4739
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4740
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4741
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4742
       ) {
4743
        if (    floatx80_is_signaling_nan( a )
4744
             || floatx80_is_signaling_nan( b ) ) {
4745
            float_raise( float_flag_invalid STATUS_VAR);
4746
        }
4747
        return 0;
4748
    }
4749
    return
4750
           ( a.low == b.low )
4751
        && (    ( a.high == b.high )
4752
             || (    ( a.low == 0 )
4753
                  && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4754
           );
4755

    
4756
}
4757

    
4758
/*----------------------------------------------------------------------------
4759
| Returns 1 if the extended double-precision floating-point value `a' is less
4760
| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4761
| do not cause an exception.  Otherwise, the comparison is performed according
4762
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4763
*----------------------------------------------------------------------------*/
4764

    
4765
int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4766
{
4767
    flag aSign, bSign;
4768

    
4769
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4770
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4771
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4772
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4773
       ) {
4774
        if (    floatx80_is_signaling_nan( a )
4775
             || floatx80_is_signaling_nan( b ) ) {
4776
            float_raise( float_flag_invalid STATUS_VAR);
4777
        }
4778
        return 0;
4779
    }
4780
    aSign = extractFloatx80Sign( a );
4781
    bSign = extractFloatx80Sign( b );
4782
    if ( aSign != bSign ) {
4783
        return
4784
               aSign
4785
            || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4786
                 == 0 );
4787
    }
4788
    return
4789
          aSign ? le128( b.high, b.low, a.high, a.low )
4790
        : le128( a.high, a.low, b.high, b.low );
4791

    
4792
}
4793

    
4794
/*----------------------------------------------------------------------------
4795
| Returns 1 if the extended double-precision floating-point value `a' is less
4796
| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4797
| an exception.  Otherwise, the comparison is performed according to the
4798
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4799
*----------------------------------------------------------------------------*/
4800

    
4801
int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4802
{
4803
    flag aSign, bSign;
4804

    
4805
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4806
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4807
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4808
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4809
       ) {
4810
        if (    floatx80_is_signaling_nan( a )
4811
             || floatx80_is_signaling_nan( b ) ) {
4812
            float_raise( float_flag_invalid STATUS_VAR);
4813
        }
4814
        return 0;
4815
    }
4816
    aSign = extractFloatx80Sign( a );
4817
    bSign = extractFloatx80Sign( b );
4818
    if ( aSign != bSign ) {
4819
        return
4820
               aSign
4821
            && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4822
                 != 0 );
4823
    }
4824
    return
4825
          aSign ? lt128( b.high, b.low, a.high, a.low )
4826
        : lt128( a.high, a.low, b.high, b.low );
4827

    
4828
}
4829

    
4830
/*----------------------------------------------------------------------------
4831
| Returns 1 if the extended double-precision floating-point values `a' and `b'
4832
| cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
4833
| The comparison is performed according to the IEC/IEEE Standard for Binary
4834
| Floating-Point Arithmetic.
4835
*----------------------------------------------------------------------------*/
4836
int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4837
{
4838
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4839
              && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4840
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4841
              && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4842
       ) {
4843
        if (    floatx80_is_signaling_nan( a )
4844
             || floatx80_is_signaling_nan( b ) ) {
4845
            float_raise( float_flag_invalid STATUS_VAR);
4846
        }
4847
        return 1;
4848
    }
4849
    return 0;
4850
}
4851

    
4852
#endif
4853

    
4854
#ifdef FLOAT128
4855

    
4856
/*----------------------------------------------------------------------------
4857
| Returns the result of converting the quadruple-precision floating-point
4858
| value `a' to the 32-bit two's complement integer format.  The conversion
4859
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4860
| Arithmetic---which means in particular that the conversion is rounded
4861
| according to the current rounding mode.  If `a' is a NaN, the largest
4862
| positive integer is returned.  Otherwise, if the conversion overflows, the
4863
| largest integer with the same sign as `a' is returned.
4864
*----------------------------------------------------------------------------*/
4865

    
4866
int32 float128_to_int32( float128 a STATUS_PARAM )
4867
{
4868
    flag aSign;
4869
    int32 aExp, shiftCount;
4870
    uint64_t aSig0, aSig1;
4871

    
4872
    aSig1 = extractFloat128Frac1( a );
4873
    aSig0 = extractFloat128Frac0( a );
4874
    aExp = extractFloat128Exp( a );
4875
    aSign = extractFloat128Sign( a );
4876
    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4877
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4878
    aSig0 |= ( aSig1 != 0 );
4879
    shiftCount = 0x4028 - aExp;
4880
    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4881
    return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4882

    
4883
}
4884

    
4885
/*----------------------------------------------------------------------------
4886
| Returns the result of converting the quadruple-precision floating-point
4887
| value `a' to the 32-bit two's complement integer format.  The conversion
4888
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4889
| Arithmetic, except that the conversion is always rounded toward zero.  If
4890
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4891
| conversion overflows, the largest integer with the same sign as `a' is
4892
| returned.
4893
*----------------------------------------------------------------------------*/
4894

    
4895
int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4896
{
4897
    flag aSign;
4898
    int32 aExp, shiftCount;
4899
    uint64_t aSig0, aSig1, savedASig;
4900
    int32 z;
4901

    
4902
    aSig1 = extractFloat128Frac1( a );
4903
    aSig0 = extractFloat128Frac0( a );
4904
    aExp = extractFloat128Exp( a );
4905
    aSign = extractFloat128Sign( a );
4906
    aSig0 |= ( aSig1 != 0 );
4907
    if ( 0x401E < aExp ) {
4908
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4909
        goto invalid;
4910
    }
4911
    else if ( aExp < 0x3FFF ) {
4912
        if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4913
        return 0;
4914
    }
4915
    aSig0 |= LIT64( 0x0001000000000000 );
4916
    shiftCount = 0x402F - aExp;
4917
    savedASig = aSig0;
4918
    aSig0 >>= shiftCount;
4919
    z = aSig0;
4920
    if ( aSign ) z = - z;
4921
    if ( ( z < 0 ) ^ aSign ) {
4922
 invalid:
4923
        float_raise( float_flag_invalid STATUS_VAR);
4924
        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4925
    }
4926
    if ( ( aSig0<<shiftCount ) != savedASig ) {
4927
        STATUS(float_exception_flags) |= float_flag_inexact;
4928
    }
4929
    return z;
4930

    
4931
}
4932

    
4933
/*----------------------------------------------------------------------------
4934
| Returns the result of converting the quadruple-precision floating-point
4935
| value `a' to the 64-bit two's complement integer format.  The conversion
4936
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4937
| Arithmetic---which means in particular that the conversion is rounded
4938
| according to the current rounding mode.  If `a' is a NaN, the largest
4939
| positive integer is returned.  Otherwise, if the conversion overflows, the
4940
| largest integer with the same sign as `a' is returned.
4941
*----------------------------------------------------------------------------*/
4942

    
4943
int64 float128_to_int64( float128 a STATUS_PARAM )
4944
{
4945
    flag aSign;
4946
    int32 aExp, shiftCount;
4947
    uint64_t aSig0, aSig1;
4948

    
4949
    aSig1 = extractFloat128Frac1( a );
4950
    aSig0 = extractFloat128Frac0( a );
4951
    aExp = extractFloat128Exp( a );
4952
    aSign = extractFloat128Sign( a );
4953
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4954
    shiftCount = 0x402F - aExp;
4955
    if ( shiftCount <= 0 ) {
4956
        if ( 0x403E < aExp ) {
4957
            float_raise( float_flag_invalid STATUS_VAR);
4958
            if (    ! aSign
4959
                 || (    ( aExp == 0x7FFF )
4960
                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4961
                    )
4962
               ) {
4963
                return LIT64( 0x7FFFFFFFFFFFFFFF );
4964
            }
4965
            return (int64_t) LIT64( 0x8000000000000000 );
4966
        }
4967
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4968
    }
4969
    else {
4970
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4971
    }
4972
    return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4973

    
4974
}
4975

    
4976
/*----------------------------------------------------------------------------
4977
| Returns the result of converting the quadruple-precision floating-point
4978
| value `a' to the 64-bit two's complement integer format.  The conversion
4979
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4980