Statistics
| Branch: | Revision:

root / fpu / softfloat.c @ 9ee6e8bb

History | View | Annotate | Download (191.2 kB)

1

    
2
/*============================================================================
3

4
This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5
Package, Release 2b.
6

7
Written by John R. Hauser.  This work was made possible in part by the
8
International Computer Science Institute, located at Suite 600, 1947 Center
9
Street, Berkeley, California 94704.  Funding was partially provided by the
10
National Science Foundation under grant MIP-9311980.  The original version
11
of this code was written as part of a project to build a fixed-point vector
12
processor in collaboration with the University of California at Berkeley,
13
overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
14
is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15
arithmetic/SoftFloat.html'.
16

17
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
18
been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19
RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20
AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21
COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22
EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23
INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24
OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
25

26
Derivative works are acceptable, even for commercial purposes, so long as
27
(1) the source code for the derivative work includes prominent notice that
28
the work is derivative, and (2) the source code includes prominent notice with
29
these four paragraphs for those parts of this code that are retained.
30

31
=============================================================================*/
32

    
33
#include "softfloat.h"
34

    
35
/*----------------------------------------------------------------------------
36
| Primitive arithmetic functions, including multi-word arithmetic, and
37
| division and square root approximations.  (Can be specialized to target if
38
| desired.)
39
*----------------------------------------------------------------------------*/
40
#include "softfloat-macros.h"
41

    
42
/*----------------------------------------------------------------------------
43
| Functions and definitions to determine:  (1) whether tininess for underflow
44
| is detected before or after rounding by default, (2) what (if anything)
45
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
46
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
47
| are propagated from function inputs to output.  These details are target-
48
| specific.
49
*----------------------------------------------------------------------------*/
50
#include "softfloat-specialize.h"
51

    
52
void set_float_rounding_mode(int val STATUS_PARAM)
53
{
54
    STATUS(float_rounding_mode) = val;
55
}
56

    
57
void set_float_exception_flags(int val STATUS_PARAM)
58
{
59
    STATUS(float_exception_flags) = val;
60
}
61

    
62
#ifdef FLOATX80
63
void set_floatx80_rounding_precision(int val STATUS_PARAM)
64
{
65
    STATUS(floatx80_rounding_precision) = val;
66
}
67
#endif
68

    
69
/*----------------------------------------------------------------------------
70
| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
71
| and 7, and returns the properly rounded 32-bit integer corresponding to the
72
| input.  If `zSign' is 1, the input is negated before being converted to an
73
| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
74
| is simply rounded to an integer, with the inexact exception raised if the
75
| input cannot be represented exactly as an integer.  However, if the fixed-
76
| point input is too large, the invalid exception is raised and the largest
77
| positive or negative integer is returned.
78
*----------------------------------------------------------------------------*/
79

    
80
static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
81
{
82
    int8 roundingMode;
83
    flag roundNearestEven;
84
    int8 roundIncrement, roundBits;
85
    int32 z;
86

    
87
    roundingMode = STATUS(float_rounding_mode);
88
    roundNearestEven = ( roundingMode == float_round_nearest_even );
89
    roundIncrement = 0x40;
90
    if ( ! roundNearestEven ) {
91
        if ( roundingMode == float_round_to_zero ) {
92
            roundIncrement = 0;
93
        }
94
        else {
95
            roundIncrement = 0x7F;
96
            if ( zSign ) {
97
                if ( roundingMode == float_round_up ) roundIncrement = 0;
98
            }
99
            else {
100
                if ( roundingMode == float_round_down ) roundIncrement = 0;
101
            }
102
        }
103
    }
104
    roundBits = absZ & 0x7F;
105
    absZ = ( absZ + roundIncrement )>>7;
106
    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
107
    z = absZ;
108
    if ( zSign ) z = - z;
109
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
110
        float_raise( float_flag_invalid STATUS_VAR);
111
        return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
112
    }
113
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
114
    return z;
115

    
116
}
117

    
118
/*----------------------------------------------------------------------------
119
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
120
| `absZ1', with binary point between bits 63 and 64 (between the input words),
121
| and returns the properly rounded 64-bit integer corresponding to the input.
122
| If `zSign' is 1, the input is negated before being converted to an integer.
123
| Ordinarily, the fixed-point input is simply rounded to an integer, with
124
| the inexact exception raised if the input cannot be represented exactly as
125
| an integer.  However, if the fixed-point input is too large, the invalid
126
| exception is raised and the largest positive or negative integer is
127
| returned.
128
*----------------------------------------------------------------------------*/
129

    
130
static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
131
{
132
    int8 roundingMode;
133
    flag roundNearestEven, increment;
134
    int64 z;
135

    
136
    roundingMode = STATUS(float_rounding_mode);
137
    roundNearestEven = ( roundingMode == float_round_nearest_even );
138
    increment = ( (sbits64) absZ1 < 0 );
139
    if ( ! roundNearestEven ) {
140
        if ( roundingMode == float_round_to_zero ) {
141
            increment = 0;
142
        }
143
        else {
144
            if ( zSign ) {
145
                increment = ( roundingMode == float_round_down ) && absZ1;
146
            }
147
            else {
148
                increment = ( roundingMode == float_round_up ) && absZ1;
149
            }
150
        }
151
    }
152
    if ( increment ) {
153
        ++absZ0;
154
        if ( absZ0 == 0 ) goto overflow;
155
        absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
156
    }
157
    z = absZ0;
158
    if ( zSign ) z = - z;
159
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
160
 overflow:
161
        float_raise( float_flag_invalid STATUS_VAR);
162
        return
163
              zSign ? (sbits64) LIT64( 0x8000000000000000 )
164
            : LIT64( 0x7FFFFFFFFFFFFFFF );
165
    }
166
    if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
167
    return z;
168

    
169
}
170

    
171
/*----------------------------------------------------------------------------
172
| Returns the fraction bits of the single-precision floating-point value `a'.
173
*----------------------------------------------------------------------------*/
174

    
175
INLINE bits32 extractFloat32Frac( float32 a )
176
{
177

    
178
    return a & 0x007FFFFF;
179

    
180
}
181

    
182
/*----------------------------------------------------------------------------
183
| Returns the exponent bits of the single-precision floating-point value `a'.
184
*----------------------------------------------------------------------------*/
185

    
186
INLINE int16 extractFloat32Exp( float32 a )
187
{
188

    
189
    return ( a>>23 ) & 0xFF;
190

    
191
}
192

    
193
/*----------------------------------------------------------------------------
194
| Returns the sign bit of the single-precision floating-point value `a'.
195
*----------------------------------------------------------------------------*/
196

    
197
INLINE flag extractFloat32Sign( float32 a )
198
{
199

    
200
    return a>>31;
201

    
202
}
203

    
204
/*----------------------------------------------------------------------------
205
| Normalizes the subnormal single-precision floating-point value represented
206
| by the denormalized significand `aSig'.  The normalized exponent and
207
| significand are stored at the locations pointed to by `zExpPtr' and
208
| `zSigPtr', respectively.
209
*----------------------------------------------------------------------------*/
210

    
211
static void
212
 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
213
{
214
    int8 shiftCount;
215

    
216
    shiftCount = countLeadingZeros32( aSig ) - 8;
217
    *zSigPtr = aSig<<shiftCount;
218
    *zExpPtr = 1 - shiftCount;
219

    
220
}
221

    
222
/*----------------------------------------------------------------------------
223
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
224
| single-precision floating-point value, returning the result.  After being
225
| shifted into the proper positions, the three fields are simply added
226
| together to form the result.  This means that any integer portion of `zSig'
227
| will be added into the exponent.  Since a properly normalized significand
228
| will have an integer portion equal to 1, the `zExp' input should be 1 less
229
| than the desired result exponent whenever `zSig' is a complete, normalized
230
| significand.
231
*----------------------------------------------------------------------------*/
232

    
233
INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
234
{
235

    
236
    return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
237

    
238
}
239

    
240
/*----------------------------------------------------------------------------
241
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
242
| and significand `zSig', and returns the proper single-precision floating-
243
| point value corresponding to the abstract input.  Ordinarily, the abstract
244
| value is simply rounded and packed into the single-precision format, with
245
| the inexact exception raised if the abstract input cannot be represented
246
| exactly.  However, if the abstract value is too large, the overflow and
247
| inexact exceptions are raised and an infinity or maximal finite value is
248
| returned.  If the abstract value is too small, the input value is rounded to
249
| a subnormal number, and the underflow and inexact exceptions are raised if
250
| the abstract input cannot be represented exactly as a subnormal single-
251
| precision floating-point number.
252
|     The input significand `zSig' has its binary point between bits 30
253
| and 29, which is 7 bits to the left of the usual location.  This shifted
254
| significand must be normalized or smaller.  If `zSig' is not normalized,
255
| `zExp' must be 0; in that case, the result returned is a subnormal number,
256
| and it must not require rounding.  In the usual case that `zSig' is
257
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
258
| The handling of underflow and overflow follows the IEC/IEEE Standard for
259
| Binary Floating-Point Arithmetic.
260
*----------------------------------------------------------------------------*/
261

    
262
static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
263
{
264
    int8 roundingMode;
265
    flag roundNearestEven;
266
    int8 roundIncrement, roundBits;
267
    flag isTiny;
268

    
269
    roundingMode = STATUS(float_rounding_mode);
270
    roundNearestEven = ( roundingMode == float_round_nearest_even );
271
    roundIncrement = 0x40;
272
    if ( ! roundNearestEven ) {
273
        if ( roundingMode == float_round_to_zero ) {
274
            roundIncrement = 0;
275
        }
276
        else {
277
            roundIncrement = 0x7F;
278
            if ( zSign ) {
279
                if ( roundingMode == float_round_up ) roundIncrement = 0;
280
            }
281
            else {
282
                if ( roundingMode == float_round_down ) roundIncrement = 0;
283
            }
284
        }
285
    }
286
    roundBits = zSig & 0x7F;
287
    if ( 0xFD <= (bits16) zExp ) {
288
        if (    ( 0xFD < zExp )
289
             || (    ( zExp == 0xFD )
290
                  && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
291
           ) {
292
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
293
            return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
294
        }
295
        if ( zExp < 0 ) {
296
            isTiny =
297
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
298
                || ( zExp < -1 )
299
                || ( zSig + roundIncrement < 0x80000000 );
300
            shift32RightJamming( zSig, - zExp, &zSig );
301
            zExp = 0;
302
            roundBits = zSig & 0x7F;
303
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
304
        }
305
    }
306
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
307
    zSig = ( zSig + roundIncrement )>>7;
308
    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
309
    if ( zSig == 0 ) zExp = 0;
310
    return packFloat32( zSign, zExp, zSig );
311

    
312
}
313

    
314
/*----------------------------------------------------------------------------
315
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
316
| and significand `zSig', and returns the proper single-precision floating-
317
| point value corresponding to the abstract input.  This routine is just like
318
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
319
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
320
| floating-point exponent.
321
*----------------------------------------------------------------------------*/
322

    
323
static float32
324
 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
325
{
326
    int8 shiftCount;
327

    
328
    shiftCount = countLeadingZeros32( zSig ) - 1;
329
    return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
330

    
331
}
332

    
333
/*----------------------------------------------------------------------------
334
| Returns the fraction bits of the double-precision floating-point value `a'.
335
*----------------------------------------------------------------------------*/
336

    
337
INLINE bits64 extractFloat64Frac( float64 a )
338
{
339

    
340
    return a & LIT64( 0x000FFFFFFFFFFFFF );
341

    
342
}
343

    
344
/*----------------------------------------------------------------------------
345
| Returns the exponent bits of the double-precision floating-point value `a'.
346
*----------------------------------------------------------------------------*/
347

    
348
INLINE int16 extractFloat64Exp( float64 a )
349
{
350

    
351
    return ( a>>52 ) & 0x7FF;
352

    
353
}
354

    
355
/*----------------------------------------------------------------------------
356
| Returns the sign bit of the double-precision floating-point value `a'.
357
*----------------------------------------------------------------------------*/
358

    
359
INLINE flag extractFloat64Sign( float64 a )
360
{
361

    
362
    return a>>63;
363

    
364
}
365

    
366
/*----------------------------------------------------------------------------
367
| Normalizes the subnormal double-precision floating-point value represented
368
| by the denormalized significand `aSig'.  The normalized exponent and
369
| significand are stored at the locations pointed to by `zExpPtr' and
370
| `zSigPtr', respectively.
371
*----------------------------------------------------------------------------*/
372

    
373
static void
374
 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
375
{
376
    int8 shiftCount;
377

    
378
    shiftCount = countLeadingZeros64( aSig ) - 11;
379
    *zSigPtr = aSig<<shiftCount;
380
    *zExpPtr = 1 - shiftCount;
381

    
382
}
383

    
384
/*----------------------------------------------------------------------------
385
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
386
| double-precision floating-point value, returning the result.  After being
387
| shifted into the proper positions, the three fields are simply added
388
| together to form the result.  This means that any integer portion of `zSig'
389
| will be added into the exponent.  Since a properly normalized significand
390
| will have an integer portion equal to 1, the `zExp' input should be 1 less
391
| than the desired result exponent whenever `zSig' is a complete, normalized
392
| significand.
393
*----------------------------------------------------------------------------*/
394

    
395
INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
396
{
397

    
398
    return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
399

    
400
}
401

    
402
/*----------------------------------------------------------------------------
403
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
404
| and significand `zSig', and returns the proper double-precision floating-
405
| point value corresponding to the abstract input.  Ordinarily, the abstract
406
| value is simply rounded and packed into the double-precision format, with
407
| the inexact exception raised if the abstract input cannot be represented
408
| exactly.  However, if the abstract value is too large, the overflow and
409
| inexact exceptions are raised and an infinity or maximal finite value is
410
| returned.  If the abstract value is too small, the input value is rounded
411
| to a subnormal number, and the underflow and inexact exceptions are raised
412
| if the abstract input cannot be represented exactly as a subnormal double-
413
| precision floating-point number.
414
|     The input significand `zSig' has its binary point between bits 62
415
| and 61, which is 10 bits to the left of the usual location.  This shifted
416
| significand must be normalized or smaller.  If `zSig' is not normalized,
417
| `zExp' must be 0; in that case, the result returned is a subnormal number,
418
| and it must not require rounding.  In the usual case that `zSig' is
419
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
420
| The handling of underflow and overflow follows the IEC/IEEE Standard for
421
| Binary Floating-Point Arithmetic.
422
*----------------------------------------------------------------------------*/
423

    
424
static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
425
{
426
    int8 roundingMode;
427
    flag roundNearestEven;
428
    int16 roundIncrement, roundBits;
429
    flag isTiny;
430

    
431
    roundingMode = STATUS(float_rounding_mode);
432
    roundNearestEven = ( roundingMode == float_round_nearest_even );
433
    roundIncrement = 0x200;
434
    if ( ! roundNearestEven ) {
435
        if ( roundingMode == float_round_to_zero ) {
436
            roundIncrement = 0;
437
        }
438
        else {
439
            roundIncrement = 0x3FF;
440
            if ( zSign ) {
441
                if ( roundingMode == float_round_up ) roundIncrement = 0;
442
            }
443
            else {
444
                if ( roundingMode == float_round_down ) roundIncrement = 0;
445
            }
446
        }
447
    }
448
    roundBits = zSig & 0x3FF;
449
    if ( 0x7FD <= (bits16) zExp ) {
450
        if (    ( 0x7FD < zExp )
451
             || (    ( zExp == 0x7FD )
452
                  && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
453
           ) {
454
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
455
            return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
456
        }
457
        if ( zExp < 0 ) {
458
            isTiny =
459
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
460
                || ( zExp < -1 )
461
                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
462
            shift64RightJamming( zSig, - zExp, &zSig );
463
            zExp = 0;
464
            roundBits = zSig & 0x3FF;
465
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
466
        }
467
    }
468
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
469
    zSig = ( zSig + roundIncrement )>>10;
470
    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
471
    if ( zSig == 0 ) zExp = 0;
472
    return packFloat64( zSign, zExp, zSig );
473

    
474
}
475

    
476
/*----------------------------------------------------------------------------
477
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
478
| and significand `zSig', and returns the proper double-precision floating-
479
| point value corresponding to the abstract input.  This routine is just like
480
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
481
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
482
| floating-point exponent.
483
*----------------------------------------------------------------------------*/
484

    
485
static float64
486
 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
487
{
488
    int8 shiftCount;
489

    
490
    shiftCount = countLeadingZeros64( zSig ) - 1;
491
    return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
492

    
493
}
494

    
495
#ifdef FLOATX80
496

    
497
/*----------------------------------------------------------------------------
498
| Returns the fraction bits of the extended double-precision floating-point
499
| value `a'.
500
*----------------------------------------------------------------------------*/
501

    
502
INLINE bits64 extractFloatx80Frac( floatx80 a )
503
{
504

    
505
    return a.low;
506

    
507
}
508

    
509
/*----------------------------------------------------------------------------
510
| Returns the exponent bits of the extended double-precision floating-point
511
| value `a'.
512
*----------------------------------------------------------------------------*/
513

    
514
INLINE int32 extractFloatx80Exp( floatx80 a )
515
{
516

    
517
    return a.high & 0x7FFF;
518

    
519
}
520

    
521
/*----------------------------------------------------------------------------
522
| Returns the sign bit of the extended double-precision floating-point value
523
| `a'.
524
*----------------------------------------------------------------------------*/
525

    
526
INLINE flag extractFloatx80Sign( floatx80 a )
527
{
528

    
529
    return a.high>>15;
530

    
531
}
532

    
533
/*----------------------------------------------------------------------------
534
| Normalizes the subnormal extended double-precision floating-point value
535
| represented by the denormalized significand `aSig'.  The normalized exponent
536
| and significand are stored at the locations pointed to by `zExpPtr' and
537
| `zSigPtr', respectively.
538
*----------------------------------------------------------------------------*/
539

    
540
static void
541
 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
542
{
543
    int8 shiftCount;
544

    
545
    shiftCount = countLeadingZeros64( aSig );
546
    *zSigPtr = aSig<<shiftCount;
547
    *zExpPtr = 1 - shiftCount;
548

    
549
}
550

    
551
/*----------------------------------------------------------------------------
552
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
553
| extended double-precision floating-point value, returning the result.
554
*----------------------------------------------------------------------------*/
555

    
556
INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
557
{
558
    floatx80 z;
559

    
560
    z.low = zSig;
561
    z.high = ( ( (bits16) zSign )<<15 ) + zExp;
562
    return z;
563

    
564
}
565

    
566
/*----------------------------------------------------------------------------
567
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
568
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
569
| and returns the proper extended double-precision floating-point value
570
| corresponding to the abstract input.  Ordinarily, the abstract value is
571
| rounded and packed into the extended double-precision format, with the
572
| inexact exception raised if the abstract input cannot be represented
573
| exactly.  However, if the abstract value is too large, the overflow and
574
| inexact exceptions are raised and an infinity or maximal finite value is
575
| returned.  If the abstract value is too small, the input value is rounded to
576
| a subnormal number, and the underflow and inexact exceptions are raised if
577
| the abstract input cannot be represented exactly as a subnormal extended
578
| double-precision floating-point number.
579
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
580
| number of bits as single or double precision, respectively.  Otherwise, the
581
| result is rounded to the full precision of the extended double-precision
582
| format.
583
|     The input significand must be normalized or smaller.  If the input
584
| significand is not normalized, `zExp' must be 0; in that case, the result
585
| returned is a subnormal number, and it must not require rounding.  The
586
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
587
| Floating-Point Arithmetic.
588
*----------------------------------------------------------------------------*/
589

    
590
static floatx80
591
 roundAndPackFloatx80(
592
     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
593
 STATUS_PARAM)
594
{
595
    int8 roundingMode;
596
    flag roundNearestEven, increment, isTiny;
597
    int64 roundIncrement, roundMask, roundBits;
598

    
599
    roundingMode = STATUS(float_rounding_mode);
600
    roundNearestEven = ( roundingMode == float_round_nearest_even );
601
    if ( roundingPrecision == 80 ) goto precision80;
602
    if ( roundingPrecision == 64 ) {
603
        roundIncrement = LIT64( 0x0000000000000400 );
604
        roundMask = LIT64( 0x00000000000007FF );
605
    }
606
    else if ( roundingPrecision == 32 ) {
607
        roundIncrement = LIT64( 0x0000008000000000 );
608
        roundMask = LIT64( 0x000000FFFFFFFFFF );
609
    }
610
    else {
611
        goto precision80;
612
    }
613
    zSig0 |= ( zSig1 != 0 );
614
    if ( ! roundNearestEven ) {
615
        if ( roundingMode == float_round_to_zero ) {
616
            roundIncrement = 0;
617
        }
618
        else {
619
            roundIncrement = roundMask;
620
            if ( zSign ) {
621
                if ( roundingMode == float_round_up ) roundIncrement = 0;
622
            }
623
            else {
624
                if ( roundingMode == float_round_down ) roundIncrement = 0;
625
            }
626
        }
627
    }
628
    roundBits = zSig0 & roundMask;
629
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
630
        if (    ( 0x7FFE < zExp )
631
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
632
           ) {
633
            goto overflow;
634
        }
635
        if ( zExp <= 0 ) {
636
            isTiny =
637
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
638
                || ( zExp < 0 )
639
                || ( zSig0 <= zSig0 + roundIncrement );
640
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
641
            zExp = 0;
642
            roundBits = zSig0 & roundMask;
643
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
644
            if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
645
            zSig0 += roundIncrement;
646
            if ( (sbits64) zSig0 < 0 ) zExp = 1;
647
            roundIncrement = roundMask + 1;
648
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
649
                roundMask |= roundIncrement;
650
            }
651
            zSig0 &= ~ roundMask;
652
            return packFloatx80( zSign, zExp, zSig0 );
653
        }
654
    }
655
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
656
    zSig0 += roundIncrement;
657
    if ( zSig0 < roundIncrement ) {
658
        ++zExp;
659
        zSig0 = LIT64( 0x8000000000000000 );
660
    }
661
    roundIncrement = roundMask + 1;
662
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
663
        roundMask |= roundIncrement;
664
    }
665
    zSig0 &= ~ roundMask;
666
    if ( zSig0 == 0 ) zExp = 0;
667
    return packFloatx80( zSign, zExp, zSig0 );
668
 precision80:
669
    increment = ( (sbits64) zSig1 < 0 );
670
    if ( ! roundNearestEven ) {
671
        if ( roundingMode == float_round_to_zero ) {
672
            increment = 0;
673
        }
674
        else {
675
            if ( zSign ) {
676
                increment = ( roundingMode == float_round_down ) && zSig1;
677
            }
678
            else {
679
                increment = ( roundingMode == float_round_up ) && zSig1;
680
            }
681
        }
682
    }
683
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
684
        if (    ( 0x7FFE < zExp )
685
             || (    ( zExp == 0x7FFE )
686
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
687
                  && increment
688
                )
689
           ) {
690
            roundMask = 0;
691
 overflow:
692
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
693
            if (    ( roundingMode == float_round_to_zero )
694
                 || ( zSign && ( roundingMode == float_round_up ) )
695
                 || ( ! zSign && ( roundingMode == float_round_down ) )
696
               ) {
697
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
698
            }
699
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
700
        }
701
        if ( zExp <= 0 ) {
702
            isTiny =
703
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
704
                || ( zExp < 0 )
705
                || ! increment
706
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
707
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
708
            zExp = 0;
709
            if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
710
            if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
711
            if ( roundNearestEven ) {
712
                increment = ( (sbits64) zSig1 < 0 );
713
            }
714
            else {
715
                if ( zSign ) {
716
                    increment = ( roundingMode == float_round_down ) && zSig1;
717
                }
718
                else {
719
                    increment = ( roundingMode == float_round_up ) && zSig1;
720
                }
721
            }
722
            if ( increment ) {
723
                ++zSig0;
724
                zSig0 &=
725
                    ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
726
                if ( (sbits64) zSig0 < 0 ) zExp = 1;
727
            }
728
            return packFloatx80( zSign, zExp, zSig0 );
729
        }
730
    }
731
    if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
732
    if ( increment ) {
733
        ++zSig0;
734
        if ( zSig0 == 0 ) {
735
            ++zExp;
736
            zSig0 = LIT64( 0x8000000000000000 );
737
        }
738
        else {
739
            zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
740
        }
741
    }
742
    else {
743
        if ( zSig0 == 0 ) zExp = 0;
744
    }
745
    return packFloatx80( zSign, zExp, zSig0 );
746

    
747
}
748

    
749
/*----------------------------------------------------------------------------
750
| Takes an abstract floating-point value having sign `zSign', exponent
751
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
752
| and returns the proper extended double-precision floating-point value
753
| corresponding to the abstract input.  This routine is just like
754
| `roundAndPackFloatx80' except that the input significand does not have to be
755
| normalized.
756
*----------------------------------------------------------------------------*/
757

    
758
static floatx80
759
 normalizeRoundAndPackFloatx80(
760
     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
761
 STATUS_PARAM)
762
{
763
    int8 shiftCount;
764

    
765
    if ( zSig0 == 0 ) {
766
        zSig0 = zSig1;
767
        zSig1 = 0;
768
        zExp -= 64;
769
    }
770
    shiftCount = countLeadingZeros64( zSig0 );
771
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
772
    zExp -= shiftCount;
773
    return
774
        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
775

    
776
}
777

    
778
#endif
779

    
780
#ifdef FLOAT128
781

    
782
/*----------------------------------------------------------------------------
783
| Returns the least-significant 64 fraction bits of the quadruple-precision
784
| floating-point value `a'.
785
*----------------------------------------------------------------------------*/
786

    
787
INLINE bits64 extractFloat128Frac1( float128 a )
788
{
789

    
790
    return a.low;
791

    
792
}
793

    
794
/*----------------------------------------------------------------------------
795
| Returns the most-significant 48 fraction bits of the quadruple-precision
796
| floating-point value `a'.
797
*----------------------------------------------------------------------------*/
798

    
799
INLINE bits64 extractFloat128Frac0( float128 a )
800
{
801

    
802
    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
803

    
804
}
805

    
806
/*----------------------------------------------------------------------------
807
| Returns the exponent bits of the quadruple-precision floating-point value
808
| `a'.
809
*----------------------------------------------------------------------------*/
810

    
811
INLINE int32 extractFloat128Exp( float128 a )
812
{
813

    
814
    return ( a.high>>48 ) & 0x7FFF;
815

    
816
}
817

    
818
/*----------------------------------------------------------------------------
819
| Returns the sign bit of the quadruple-precision floating-point value `a'.
820
*----------------------------------------------------------------------------*/
821

    
822
INLINE flag extractFloat128Sign( float128 a )
823
{
824

    
825
    return a.high>>63;
826

    
827
}
828

    
829
/*----------------------------------------------------------------------------
830
| Normalizes the subnormal quadruple-precision floating-point value
831
| represented by the denormalized significand formed by the concatenation of
832
| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
833
| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
834
| significand are stored at the location pointed to by `zSig0Ptr', and the
835
| least significant 64 bits of the normalized significand are stored at the
836
| location pointed to by `zSig1Ptr'.
837
*----------------------------------------------------------------------------*/
838

    
839
static void
840
 normalizeFloat128Subnormal(
841
     bits64 aSig0,
842
     bits64 aSig1,
843
     int32 *zExpPtr,
844
     bits64 *zSig0Ptr,
845
     bits64 *zSig1Ptr
846
 )
847
{
848
    int8 shiftCount;
849

    
850
    if ( aSig0 == 0 ) {
851
        shiftCount = countLeadingZeros64( aSig1 ) - 15;
852
        if ( shiftCount < 0 ) {
853
            *zSig0Ptr = aSig1>>( - shiftCount );
854
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
855
        }
856
        else {
857
            *zSig0Ptr = aSig1<<shiftCount;
858
            *zSig1Ptr = 0;
859
        }
860
        *zExpPtr = - shiftCount - 63;
861
    }
862
    else {
863
        shiftCount = countLeadingZeros64( aSig0 ) - 15;
864
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
865
        *zExpPtr = 1 - shiftCount;
866
    }
867

    
868
}
869

    
870
/*----------------------------------------------------------------------------
871
| Packs the sign `zSign', the exponent `zExp', and the significand formed
872
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
873
| floating-point value, returning the result.  After being shifted into the
874
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
875
| added together to form the most significant 32 bits of the result.  This
876
| means that any integer portion of `zSig0' will be added into the exponent.
877
| Since a properly normalized significand will have an integer portion equal
878
| to 1, the `zExp' input should be 1 less than the desired result exponent
879
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
880
| significand.
881
*----------------------------------------------------------------------------*/
882

    
883
INLINE float128
884
 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
885
{
886
    float128 z;
887

    
888
    z.low = zSig1;
889
    z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
890
    return z;
891

    
892
}
893

    
894
/*----------------------------------------------------------------------------
895
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
896
| and extended significand formed by the concatenation of `zSig0', `zSig1',
897
| and `zSig2', and returns the proper quadruple-precision floating-point value
898
| corresponding to the abstract input.  Ordinarily, the abstract value is
899
| simply rounded and packed into the quadruple-precision format, with the
900
| inexact exception raised if the abstract input cannot be represented
901
| exactly.  However, if the abstract value is too large, the overflow and
902
| inexact exceptions are raised and an infinity or maximal finite value is
903
| returned.  If the abstract value is too small, the input value is rounded to
904
| a subnormal number, and the underflow and inexact exceptions are raised if
905
| the abstract input cannot be represented exactly as a subnormal quadruple-
906
| precision floating-point number.
907
|     The input significand must be normalized or smaller.  If the input
908
| significand is not normalized, `zExp' must be 0; in that case, the result
909
| returned is a subnormal number, and it must not require rounding.  In the
910
| usual case that the input significand is normalized, `zExp' must be 1 less
911
| than the ``true'' floating-point exponent.  The handling of underflow and
912
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
913
*----------------------------------------------------------------------------*/
914

    
915
static float128
916
 roundAndPackFloat128(
917
     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
918
{
919
    int8 roundingMode;
920
    flag roundNearestEven, increment, isTiny;
921

    
922
    roundingMode = STATUS(float_rounding_mode);
923
    roundNearestEven = ( roundingMode == float_round_nearest_even );
924
    increment = ( (sbits64) zSig2 < 0 );
925
    if ( ! roundNearestEven ) {
926
        if ( roundingMode == float_round_to_zero ) {
927
            increment = 0;
928
        }
929
        else {
930
            if ( zSign ) {
931
                increment = ( roundingMode == float_round_down ) && zSig2;
932
            }
933
            else {
934
                increment = ( roundingMode == float_round_up ) && zSig2;
935
            }
936
        }
937
    }
938
    if ( 0x7FFD <= (bits32) zExp ) {
939
        if (    ( 0x7FFD < zExp )
940
             || (    ( zExp == 0x7FFD )
941
                  && eq128(
942
                         LIT64( 0x0001FFFFFFFFFFFF ),
943
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
944
                         zSig0,
945
                         zSig1
946
                     )
947
                  && increment
948
                )
949
           ) {
950
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
951
            if (    ( roundingMode == float_round_to_zero )
952
                 || ( zSign && ( roundingMode == float_round_up ) )
953
                 || ( ! zSign && ( roundingMode == float_round_down ) )
954
               ) {
955
                return
956
                    packFloat128(
957
                        zSign,
958
                        0x7FFE,
959
                        LIT64( 0x0000FFFFFFFFFFFF ),
960
                        LIT64( 0xFFFFFFFFFFFFFFFF )
961
                    );
962
            }
963
            return packFloat128( zSign, 0x7FFF, 0, 0 );
964
        }
965
        if ( zExp < 0 ) {
966
            isTiny =
967
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
968
                || ( zExp < -1 )
969
                || ! increment
970
                || lt128(
971
                       zSig0,
972
                       zSig1,
973
                       LIT64( 0x0001FFFFFFFFFFFF ),
974
                       LIT64( 0xFFFFFFFFFFFFFFFF )
975
                   );
976
            shift128ExtraRightJamming(
977
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
978
            zExp = 0;
979
            if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
980
            if ( roundNearestEven ) {
981
                increment = ( (sbits64) zSig2 < 0 );
982
            }
983
            else {
984
                if ( zSign ) {
985
                    increment = ( roundingMode == float_round_down ) && zSig2;
986
                }
987
                else {
988
                    increment = ( roundingMode == float_round_up ) && zSig2;
989
                }
990
            }
991
        }
992
    }
993
    if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
994
    if ( increment ) {
995
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
996
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
997
    }
998
    else {
999
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1000
    }
1001
    return packFloat128( zSign, zExp, zSig0, zSig1 );
1002

    
1003
}
1004

    
1005
/*----------------------------------------------------------------------------
1006
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1007
| and significand formed by the concatenation of `zSig0' and `zSig1', and
1008
| returns the proper quadruple-precision floating-point value corresponding
1009
| to the abstract input.  This routine is just like `roundAndPackFloat128'
1010
| except that the input significand has fewer bits and does not have to be
1011
| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1012
| point exponent.
1013
*----------------------------------------------------------------------------*/
1014

    
1015
static float128
1016
 normalizeRoundAndPackFloat128(
1017
     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1018
{
1019
    int8 shiftCount;
1020
    bits64 zSig2;
1021

    
1022
    if ( zSig0 == 0 ) {
1023
        zSig0 = zSig1;
1024
        zSig1 = 0;
1025
        zExp -= 64;
1026
    }
1027
    shiftCount = countLeadingZeros64( zSig0 ) - 15;
1028
    if ( 0 <= shiftCount ) {
1029
        zSig2 = 0;
1030
        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1031
    }
1032
    else {
1033
        shift128ExtraRightJamming(
1034
            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1035
    }
1036
    zExp -= shiftCount;
1037
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1038

    
1039
}
1040

    
1041
#endif
1042

    
1043
/*----------------------------------------------------------------------------
1044
| Returns the result of converting the 32-bit two's complement integer `a'
1045
| to the single-precision floating-point format.  The conversion is performed
1046
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1047
*----------------------------------------------------------------------------*/
1048

    
1049
float32 int32_to_float32( int32 a STATUS_PARAM )
1050
{
1051
    flag zSign;
1052

    
1053
    if ( a == 0 ) return 0;
1054
    if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1055
    zSign = ( a < 0 );
1056
    return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1057

    
1058
}
1059

    
1060
/*----------------------------------------------------------------------------
1061
| Returns the result of converting the 32-bit two's complement integer `a'
1062
| to the double-precision floating-point format.  The conversion is performed
1063
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1064
*----------------------------------------------------------------------------*/
1065

    
1066
float64 int32_to_float64( int32 a STATUS_PARAM )
1067
{
1068
    flag zSign;
1069
    uint32 absA;
1070
    int8 shiftCount;
1071
    bits64 zSig;
1072

    
1073
    if ( a == 0 ) return 0;
1074
    zSign = ( a < 0 );
1075
    absA = zSign ? - a : a;
1076
    shiftCount = countLeadingZeros32( absA ) + 21;
1077
    zSig = absA;
1078
    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1079

    
1080
}
1081

    
1082
#ifdef FLOATX80
1083

    
1084
/*----------------------------------------------------------------------------
1085
| Returns the result of converting the 32-bit two's complement integer `a'
1086
| to the extended double-precision floating-point format.  The conversion
1087
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1088
| Arithmetic.
1089
*----------------------------------------------------------------------------*/
1090

    
1091
floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1092
{
1093
    flag zSign;
1094
    uint32 absA;
1095
    int8 shiftCount;
1096
    bits64 zSig;
1097

    
1098
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1099
    zSign = ( a < 0 );
1100
    absA = zSign ? - a : a;
1101
    shiftCount = countLeadingZeros32( absA ) + 32;
1102
    zSig = absA;
1103
    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1104

    
1105
}
1106

    
1107
#endif
1108

    
1109
#ifdef FLOAT128
1110

    
1111
/*----------------------------------------------------------------------------
1112
| Returns the result of converting the 32-bit two's complement integer `a' to
1113
| the quadruple-precision floating-point format.  The conversion is performed
1114
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1115
*----------------------------------------------------------------------------*/
1116

    
1117
float128 int32_to_float128( int32 a STATUS_PARAM )
1118
{
1119
    flag zSign;
1120
    uint32 absA;
1121
    int8 shiftCount;
1122
    bits64 zSig0;
1123

    
1124
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1125
    zSign = ( a < 0 );
1126
    absA = zSign ? - a : a;
1127
    shiftCount = countLeadingZeros32( absA ) + 17;
1128
    zSig0 = absA;
1129
    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1130

    
1131
}
1132

    
1133
#endif
1134

    
1135
/*----------------------------------------------------------------------------
1136
| Returns the result of converting the 64-bit two's complement integer `a'
1137
| to the single-precision floating-point format.  The conversion is performed
1138
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1139
*----------------------------------------------------------------------------*/
1140

    
1141
float32 int64_to_float32( int64 a STATUS_PARAM )
1142
{
1143
    flag zSign;
1144
    uint64 absA;
1145
    int8 shiftCount;
1146

    
1147
    if ( a == 0 ) return 0;
1148
    zSign = ( a < 0 );
1149
    absA = zSign ? - a : a;
1150
    shiftCount = countLeadingZeros64( absA ) - 40;
1151
    if ( 0 <= shiftCount ) {
1152
        return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1153
    }
1154
    else {
1155
        shiftCount += 7;
1156
        if ( shiftCount < 0 ) {
1157
            shift64RightJamming( absA, - shiftCount, &absA );
1158
        }
1159
        else {
1160
            absA <<= shiftCount;
1161
        }
1162
        return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1163
    }
1164

    
1165
}
1166

    
1167
float32 uint64_to_float32( uint64 a STATUS_PARAM )
1168
{
1169
    int8 shiftCount;
1170

    
1171
    if ( a == 0 ) return 0;
1172
    shiftCount = countLeadingZeros64( a ) - 40;
1173
    if ( 0 <= shiftCount ) {
1174
        return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1175
    }
1176
    else {
1177
        shiftCount += 7;
1178
        if ( shiftCount < 0 ) {
1179
            shift64RightJamming( a, - shiftCount, &a );
1180
        }
1181
        else {
1182
            a <<= shiftCount;
1183
        }
1184
        return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1185
    }
1186
}
1187

    
1188
/*----------------------------------------------------------------------------
1189
| Returns the result of converting the 64-bit two's complement integer `a'
1190
| to the double-precision floating-point format.  The conversion is performed
1191
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1192
*----------------------------------------------------------------------------*/
1193

    
1194
float64 int64_to_float64( int64 a STATUS_PARAM )
1195
{
1196
    flag zSign;
1197

    
1198
    if ( a == 0 ) return 0;
1199
    if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1200
        return packFloat64( 1, 0x43E, 0 );
1201
    }
1202
    zSign = ( a < 0 );
1203
    return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1204

    
1205
}
1206

    
1207
float64 uint64_to_float64( uint64 a STATUS_PARAM )
1208
{
1209
    if ( a == 0 ) return 0;
1210
    return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1211

    
1212
}
1213

    
1214
#ifdef FLOATX80
1215

    
1216
/*----------------------------------------------------------------------------
1217
| Returns the result of converting the 64-bit two's complement integer `a'
1218
| to the extended double-precision floating-point format.  The conversion
1219
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1220
| Arithmetic.
1221
*----------------------------------------------------------------------------*/
1222

    
1223
floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1224
{
1225
    flag zSign;
1226
    uint64 absA;
1227
    int8 shiftCount;
1228

    
1229
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1230
    zSign = ( a < 0 );
1231
    absA = zSign ? - a : a;
1232
    shiftCount = countLeadingZeros64( absA );
1233
    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1234

    
1235
}
1236

    
1237
#endif
1238

    
1239
#ifdef FLOAT128
1240

    
1241
/*----------------------------------------------------------------------------
1242
| Returns the result of converting the 64-bit two's complement integer `a' to
1243
| the quadruple-precision floating-point format.  The conversion is performed
1244
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1245
*----------------------------------------------------------------------------*/
1246

    
1247
float128 int64_to_float128( int64 a STATUS_PARAM )
1248
{
1249
    flag zSign;
1250
    uint64 absA;
1251
    int8 shiftCount;
1252
    int32 zExp;
1253
    bits64 zSig0, zSig1;
1254

    
1255
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1256
    zSign = ( a < 0 );
1257
    absA = zSign ? - a : a;
1258
    shiftCount = countLeadingZeros64( absA ) + 49;
1259
    zExp = 0x406E - shiftCount;
1260
    if ( 64 <= shiftCount ) {
1261
        zSig1 = 0;
1262
        zSig0 = absA;
1263
        shiftCount -= 64;
1264
    }
1265
    else {
1266
        zSig1 = absA;
1267
        zSig0 = 0;
1268
    }
1269
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1270
    return packFloat128( zSign, zExp, zSig0, zSig1 );
1271

    
1272
}
1273

    
1274
#endif
1275

    
1276
/*----------------------------------------------------------------------------
1277
| Returns the result of converting the single-precision floating-point value
1278
| `a' to the 32-bit two's complement integer format.  The conversion is
1279
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1280
| Arithmetic---which means in particular that the conversion is rounded
1281
| according to the current rounding mode.  If `a' is a NaN, the largest
1282
| positive integer is returned.  Otherwise, if the conversion overflows, the
1283
| largest integer with the same sign as `a' is returned.
1284
*----------------------------------------------------------------------------*/
1285

    
1286
int32 float32_to_int32( float32 a STATUS_PARAM )
1287
{
1288
    flag aSign;
1289
    int16 aExp, shiftCount;
1290
    bits32 aSig;
1291
    bits64 aSig64;
1292

    
1293
    aSig = extractFloat32Frac( a );
1294
    aExp = extractFloat32Exp( a );
1295
    aSign = extractFloat32Sign( a );
1296
    if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1297
    if ( aExp ) aSig |= 0x00800000;
1298
    shiftCount = 0xAF - aExp;
1299
    aSig64 = aSig;
1300
    aSig64 <<= 32;
1301
    if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1302
    return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1303

    
1304
}
1305

    
1306
/*----------------------------------------------------------------------------
1307
| Returns the result of converting the single-precision floating-point value
1308
| `a' to the 32-bit two's complement integer format.  The conversion is
1309
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1310
| Arithmetic, except that the conversion is always rounded toward zero.
1311
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1312
| the conversion overflows, the largest integer with the same sign as `a' is
1313
| returned.
1314
*----------------------------------------------------------------------------*/
1315

    
1316
int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1317
{
1318
    flag aSign;
1319
    int16 aExp, shiftCount;
1320
    bits32 aSig;
1321
    int32 z;
1322

    
1323
    aSig = extractFloat32Frac( a );
1324
    aExp = extractFloat32Exp( a );
1325
    aSign = extractFloat32Sign( a );
1326
    shiftCount = aExp - 0x9E;
1327
    if ( 0 <= shiftCount ) {
1328
        if ( a != 0xCF000000 ) {
1329
            float_raise( float_flag_invalid STATUS_VAR);
1330
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1331
        }
1332
        return (sbits32) 0x80000000;
1333
    }
1334
    else if ( aExp <= 0x7E ) {
1335
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1336
        return 0;
1337
    }
1338
    aSig = ( aSig | 0x00800000 )<<8;
1339
    z = aSig>>( - shiftCount );
1340
    if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1341
        STATUS(float_exception_flags) |= float_flag_inexact;
1342
    }
1343
    if ( aSign ) z = - z;
1344
    return z;
1345

    
1346
}
1347

    
1348
/*----------------------------------------------------------------------------
1349
| Returns the result of converting the single-precision floating-point value
1350
| `a' to the 64-bit two's complement integer format.  The conversion is
1351
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1352
| Arithmetic---which means in particular that the conversion is rounded
1353
| according to the current rounding mode.  If `a' is a NaN, the largest
1354
| positive integer is returned.  Otherwise, if the conversion overflows, the
1355
| largest integer with the same sign as `a' is returned.
1356
*----------------------------------------------------------------------------*/
1357

    
1358
int64 float32_to_int64( float32 a STATUS_PARAM )
1359
{
1360
    flag aSign;
1361
    int16 aExp, shiftCount;
1362
    bits32 aSig;
1363
    bits64 aSig64, aSigExtra;
1364

    
1365
    aSig = extractFloat32Frac( a );
1366
    aExp = extractFloat32Exp( a );
1367
    aSign = extractFloat32Sign( a );
1368
    shiftCount = 0xBE - aExp;
1369
    if ( shiftCount < 0 ) {
1370
        float_raise( float_flag_invalid STATUS_VAR);
1371
        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1372
            return LIT64( 0x7FFFFFFFFFFFFFFF );
1373
        }
1374
        return (sbits64) LIT64( 0x8000000000000000 );
1375
    }
1376
    if ( aExp ) aSig |= 0x00800000;
1377
    aSig64 = aSig;
1378
    aSig64 <<= 40;
1379
    shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1380
    return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1381

    
1382
}
1383

    
1384
/*----------------------------------------------------------------------------
1385
| Returns the result of converting the single-precision floating-point value
1386
| `a' to the 64-bit two's complement integer format.  The conversion is
1387
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1388
| Arithmetic, except that the conversion is always rounded toward zero.  If
1389
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1390
| conversion overflows, the largest integer with the same sign as `a' is
1391
| returned.
1392
*----------------------------------------------------------------------------*/
1393

    
1394
int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1395
{
1396
    flag aSign;
1397
    int16 aExp, shiftCount;
1398
    bits32 aSig;
1399
    bits64 aSig64;
1400
    int64 z;
1401

    
1402
    aSig = extractFloat32Frac( a );
1403
    aExp = extractFloat32Exp( a );
1404
    aSign = extractFloat32Sign( a );
1405
    shiftCount = aExp - 0xBE;
1406
    if ( 0 <= shiftCount ) {
1407
        if ( a != 0xDF000000 ) {
1408
            float_raise( float_flag_invalid STATUS_VAR);
1409
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1410
                return LIT64( 0x7FFFFFFFFFFFFFFF );
1411
            }
1412
        }
1413
        return (sbits64) LIT64( 0x8000000000000000 );
1414
    }
1415
    else if ( aExp <= 0x7E ) {
1416
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1417
        return 0;
1418
    }
1419
    aSig64 = aSig | 0x00800000;
1420
    aSig64 <<= 40;
1421
    z = aSig64>>( - shiftCount );
1422
    if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1423
        STATUS(float_exception_flags) |= float_flag_inexact;
1424
    }
1425
    if ( aSign ) z = - z;
1426
    return z;
1427

    
1428
}
1429

    
1430
/*----------------------------------------------------------------------------
1431
| Returns the result of converting the single-precision floating-point value
1432
| `a' to the double-precision floating-point format.  The conversion is
1433
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1434
| Arithmetic.
1435
*----------------------------------------------------------------------------*/
1436

    
1437
float64 float32_to_float64( float32 a STATUS_PARAM )
1438
{
1439
    flag aSign;
1440
    int16 aExp;
1441
    bits32 aSig;
1442

    
1443
    aSig = extractFloat32Frac( a );
1444
    aExp = extractFloat32Exp( a );
1445
    aSign = extractFloat32Sign( a );
1446
    if ( aExp == 0xFF ) {
1447
        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1448
        return packFloat64( aSign, 0x7FF, 0 );
1449
    }
1450
    if ( aExp == 0 ) {
1451
        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1452
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1453
        --aExp;
1454
    }
1455
    return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1456

    
1457
}
1458

    
1459
#ifdef FLOATX80
1460

    
1461
/*----------------------------------------------------------------------------
1462
| Returns the result of converting the single-precision floating-point value
1463
| `a' to the extended double-precision floating-point format.  The conversion
1464
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1465
| Arithmetic.
1466
*----------------------------------------------------------------------------*/
1467

    
1468
floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1469
{
1470
    flag aSign;
1471
    int16 aExp;
1472
    bits32 aSig;
1473

    
1474
    aSig = extractFloat32Frac( a );
1475
    aExp = extractFloat32Exp( a );
1476
    aSign = extractFloat32Sign( a );
1477
    if ( aExp == 0xFF ) {
1478
        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1479
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1480
    }
1481
    if ( aExp == 0 ) {
1482
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1483
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1484
    }
1485
    aSig |= 0x00800000;
1486
    return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1487

    
1488
}
1489

    
1490
#endif
1491

    
1492
#ifdef FLOAT128
1493

    
1494
/*----------------------------------------------------------------------------
1495
| Returns the result of converting the single-precision floating-point value
1496
| `a' to the double-precision floating-point format.  The conversion is
1497
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1498
| Arithmetic.
1499
*----------------------------------------------------------------------------*/
1500

    
1501
float128 float32_to_float128( float32 a STATUS_PARAM )
1502
{
1503
    flag aSign;
1504
    int16 aExp;
1505
    bits32 aSig;
1506

    
1507
    aSig = extractFloat32Frac( a );
1508
    aExp = extractFloat32Exp( a );
1509
    aSign = extractFloat32Sign( a );
1510
    if ( aExp == 0xFF ) {
1511
        if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1512
        return packFloat128( aSign, 0x7FFF, 0, 0 );
1513
    }
1514
    if ( aExp == 0 ) {
1515
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1516
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1517
        --aExp;
1518
    }
1519
    return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1520

    
1521
}
1522

    
1523
#endif
1524

    
1525
/*----------------------------------------------------------------------------
1526
| Rounds the single-precision floating-point value `a' to an integer, and
1527
| returns the result as a single-precision floating-point value.  The
1528
| operation is performed according to the IEC/IEEE Standard for Binary
1529
| Floating-Point Arithmetic.
1530
*----------------------------------------------------------------------------*/
1531

    
1532
float32 float32_round_to_int( float32 a STATUS_PARAM)
1533
{
1534
    flag aSign;
1535
    int16 aExp;
1536
    bits32 lastBitMask, roundBitsMask;
1537
    int8 roundingMode;
1538
    float32 z;
1539

    
1540
    aExp = extractFloat32Exp( a );
1541
    if ( 0x96 <= aExp ) {
1542
        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1543
            return propagateFloat32NaN( a, a STATUS_VAR );
1544
        }
1545
        return a;
1546
    }
1547
    if ( aExp <= 0x7E ) {
1548
        if ( (bits32) ( a<<1 ) == 0 ) return a;
1549
        STATUS(float_exception_flags) |= float_flag_inexact;
1550
        aSign = extractFloat32Sign( a );
1551
        switch ( STATUS(float_rounding_mode) ) {
1552
         case float_round_nearest_even:
1553
            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1554
                return packFloat32( aSign, 0x7F, 0 );
1555
            }
1556
            break;
1557
         case float_round_down:
1558
            return aSign ? 0xBF800000 : 0;
1559
         case float_round_up:
1560
            return aSign ? 0x80000000 : 0x3F800000;
1561
        }
1562
        return packFloat32( aSign, 0, 0 );
1563
    }
1564
    lastBitMask = 1;
1565
    lastBitMask <<= 0x96 - aExp;
1566
    roundBitsMask = lastBitMask - 1;
1567
    z = a;
1568
    roundingMode = STATUS(float_rounding_mode);
1569
    if ( roundingMode == float_round_nearest_even ) {
1570
        z += lastBitMask>>1;
1571
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1572
    }
1573
    else if ( roundingMode != float_round_to_zero ) {
1574
        if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1575
            z += roundBitsMask;
1576
        }
1577
    }
1578
    z &= ~ roundBitsMask;
1579
    if ( z != a ) STATUS(float_exception_flags) |= float_flag_inexact;
1580
    return z;
1581

    
1582
}
1583

    
1584
/*----------------------------------------------------------------------------
1585
| Returns the result of adding the absolute values of the single-precision
1586
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1587
| before being returned.  `zSign' is ignored if the result is a NaN.
1588
| The addition is performed according to the IEC/IEEE Standard for Binary
1589
| Floating-Point Arithmetic.
1590
*----------------------------------------------------------------------------*/
1591

    
1592
static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1593
{
1594
    int16 aExp, bExp, zExp;
1595
    bits32 aSig, bSig, zSig;
1596
    int16 expDiff;
1597

    
1598
    aSig = extractFloat32Frac( a );
1599
    aExp = extractFloat32Exp( a );
1600
    bSig = extractFloat32Frac( b );
1601
    bExp = extractFloat32Exp( b );
1602
    expDiff = aExp - bExp;
1603
    aSig <<= 6;
1604
    bSig <<= 6;
1605
    if ( 0 < expDiff ) {
1606
        if ( aExp == 0xFF ) {
1607
            if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1608
            return a;
1609
        }
1610
        if ( bExp == 0 ) {
1611
            --expDiff;
1612
        }
1613
        else {
1614
            bSig |= 0x20000000;
1615
        }
1616
        shift32RightJamming( bSig, expDiff, &bSig );
1617
        zExp = aExp;
1618
    }
1619
    else if ( expDiff < 0 ) {
1620
        if ( bExp == 0xFF ) {
1621
            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1622
            return packFloat32( zSign, 0xFF, 0 );
1623
        }
1624
        if ( aExp == 0 ) {
1625
            ++expDiff;
1626
        }
1627
        else {
1628
            aSig |= 0x20000000;
1629
        }
1630
        shift32RightJamming( aSig, - expDiff, &aSig );
1631
        zExp = bExp;
1632
    }
1633
    else {
1634
        if ( aExp == 0xFF ) {
1635
            if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1636
            return a;
1637
        }
1638
        if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1639
        zSig = 0x40000000 + aSig + bSig;
1640
        zExp = aExp;
1641
        goto roundAndPack;
1642
    }
1643
    aSig |= 0x20000000;
1644
    zSig = ( aSig + bSig )<<1;
1645
    --zExp;
1646
    if ( (sbits32) zSig < 0 ) {
1647
        zSig = aSig + bSig;
1648
        ++zExp;
1649
    }
1650
 roundAndPack:
1651
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1652

    
1653
}
1654

    
1655
/*----------------------------------------------------------------------------
1656
| Returns the result of subtracting the absolute values of the single-
1657
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
1658
| difference is negated before being returned.  `zSign' is ignored if the
1659
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
1660
| Standard for Binary Floating-Point Arithmetic.
1661
*----------------------------------------------------------------------------*/
1662

    
1663
static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1664
{
1665
    int16 aExp, bExp, zExp;
1666
    bits32 aSig, bSig, zSig;
1667
    int16 expDiff;
1668

    
1669
    aSig = extractFloat32Frac( a );
1670
    aExp = extractFloat32Exp( a );
1671
    bSig = extractFloat32Frac( b );
1672
    bExp = extractFloat32Exp( b );
1673
    expDiff = aExp - bExp;
1674
    aSig <<= 7;
1675
    bSig <<= 7;
1676
    if ( 0 < expDiff ) goto aExpBigger;
1677
    if ( expDiff < 0 ) goto bExpBigger;
1678
    if ( aExp == 0xFF ) {
1679
        if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1680
        float_raise( float_flag_invalid STATUS_VAR);
1681
        return float32_default_nan;
1682
    }
1683
    if ( aExp == 0 ) {
1684
        aExp = 1;
1685
        bExp = 1;
1686
    }
1687
    if ( bSig < aSig ) goto aBigger;
1688
    if ( aSig < bSig ) goto bBigger;
1689
    return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1690
 bExpBigger:
1691
    if ( bExp == 0xFF ) {
1692
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1693
        return packFloat32( zSign ^ 1, 0xFF, 0 );
1694
    }
1695
    if ( aExp == 0 ) {
1696
        ++expDiff;
1697
    }
1698
    else {
1699
        aSig |= 0x40000000;
1700
    }
1701
    shift32RightJamming( aSig, - expDiff, &aSig );
1702
    bSig |= 0x40000000;
1703
 bBigger:
1704
    zSig = bSig - aSig;
1705
    zExp = bExp;
1706
    zSign ^= 1;
1707
    goto normalizeRoundAndPack;
1708
 aExpBigger:
1709
    if ( aExp == 0xFF ) {
1710
        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1711
        return a;
1712
    }
1713
    if ( bExp == 0 ) {
1714
        --expDiff;
1715
    }
1716
    else {
1717
        bSig |= 0x40000000;
1718
    }
1719
    shift32RightJamming( bSig, expDiff, &bSig );
1720
    aSig |= 0x40000000;
1721
 aBigger:
1722
    zSig = aSig - bSig;
1723
    zExp = aExp;
1724
 normalizeRoundAndPack:
1725
    --zExp;
1726
    return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1727

    
1728
}
1729

    
1730
/*----------------------------------------------------------------------------
1731
| Returns the result of adding the single-precision floating-point values `a'
1732
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
1733
| Binary Floating-Point Arithmetic.
1734
*----------------------------------------------------------------------------*/
1735

    
1736
float32 float32_add( float32 a, float32 b STATUS_PARAM )
1737
{
1738
    flag aSign, bSign;
1739

    
1740
    aSign = extractFloat32Sign( a );
1741
    bSign = extractFloat32Sign( b );
1742
    if ( aSign == bSign ) {
1743
        return addFloat32Sigs( a, b, aSign STATUS_VAR);
1744
    }
1745
    else {
1746
        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1747
    }
1748

    
1749
}
1750

    
1751
/*----------------------------------------------------------------------------
1752
| Returns the result of subtracting the single-precision floating-point values
1753
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1754
| for Binary Floating-Point Arithmetic.
1755
*----------------------------------------------------------------------------*/
1756

    
1757
float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1758
{
1759
    flag aSign, bSign;
1760

    
1761
    aSign = extractFloat32Sign( a );
1762
    bSign = extractFloat32Sign( b );
1763
    if ( aSign == bSign ) {
1764
        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1765
    }
1766
    else {
1767
        return addFloat32Sigs( a, b, aSign STATUS_VAR );
1768
    }
1769

    
1770
}
1771

    
1772
/*----------------------------------------------------------------------------
1773
| Returns the result of multiplying the single-precision floating-point values
1774
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1775
| for Binary Floating-Point Arithmetic.
1776
*----------------------------------------------------------------------------*/
1777

    
1778
float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1779
{
1780
    flag aSign, bSign, zSign;
1781
    int16 aExp, bExp, zExp;
1782
    bits32 aSig, bSig;
1783
    bits64 zSig64;
1784
    bits32 zSig;
1785

    
1786
    aSig = extractFloat32Frac( a );
1787
    aExp = extractFloat32Exp( a );
1788
    aSign = extractFloat32Sign( a );
1789
    bSig = extractFloat32Frac( b );
1790
    bExp = extractFloat32Exp( b );
1791
    bSign = extractFloat32Sign( b );
1792
    zSign = aSign ^ bSign;
1793
    if ( aExp == 0xFF ) {
1794
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1795
            return propagateFloat32NaN( a, b STATUS_VAR );
1796
        }
1797
        if ( ( bExp | bSig ) == 0 ) {
1798
            float_raise( float_flag_invalid STATUS_VAR);
1799
            return float32_default_nan;
1800
        }
1801
        return packFloat32( zSign, 0xFF, 0 );
1802
    }
1803
    if ( bExp == 0xFF ) {
1804
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1805
        if ( ( aExp | aSig ) == 0 ) {
1806
            float_raise( float_flag_invalid STATUS_VAR);
1807
            return float32_default_nan;
1808
        }
1809
        return packFloat32( zSign, 0xFF, 0 );
1810
    }
1811
    if ( aExp == 0 ) {
1812
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1813
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1814
    }
1815
    if ( bExp == 0 ) {
1816
        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1817
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1818
    }
1819
    zExp = aExp + bExp - 0x7F;
1820
    aSig = ( aSig | 0x00800000 )<<7;
1821
    bSig = ( bSig | 0x00800000 )<<8;
1822
    shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1823
    zSig = zSig64;
1824
    if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1825
        zSig <<= 1;
1826
        --zExp;
1827
    }
1828
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1829

    
1830
}
1831

    
1832
/*----------------------------------------------------------------------------
1833
| Returns the result of dividing the single-precision floating-point value `a'
1834
| by the corresponding value `b'.  The operation is performed according to the
1835
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1836
*----------------------------------------------------------------------------*/
1837

    
1838
float32 float32_div( float32 a, float32 b STATUS_PARAM )
1839
{
1840
    flag aSign, bSign, zSign;
1841
    int16 aExp, bExp, zExp;
1842
    bits32 aSig, bSig, zSig;
1843

    
1844
    aSig = extractFloat32Frac( a );
1845
    aExp = extractFloat32Exp( a );
1846
    aSign = extractFloat32Sign( a );
1847
    bSig = extractFloat32Frac( b );
1848
    bExp = extractFloat32Exp( b );
1849
    bSign = extractFloat32Sign( b );
1850
    zSign = aSign ^ bSign;
1851
    if ( aExp == 0xFF ) {
1852
        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1853
        if ( bExp == 0xFF ) {
1854
            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1855
            float_raise( float_flag_invalid STATUS_VAR);
1856
            return float32_default_nan;
1857
        }
1858
        return packFloat32( zSign, 0xFF, 0 );
1859
    }
1860
    if ( bExp == 0xFF ) {
1861
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1862
        return packFloat32( zSign, 0, 0 );
1863
    }
1864
    if ( bExp == 0 ) {
1865
        if ( bSig == 0 ) {
1866
            if ( ( aExp | aSig ) == 0 ) {
1867
                float_raise( float_flag_invalid STATUS_VAR);
1868
                return float32_default_nan;
1869
            }
1870
            float_raise( float_flag_divbyzero STATUS_VAR);
1871
            return packFloat32( zSign, 0xFF, 0 );
1872
        }
1873
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1874
    }
1875
    if ( aExp == 0 ) {
1876
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1877
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1878
    }
1879
    zExp = aExp - bExp + 0x7D;
1880
    aSig = ( aSig | 0x00800000 )<<7;
1881
    bSig = ( bSig | 0x00800000 )<<8;
1882
    if ( bSig <= ( aSig + aSig ) ) {
1883
        aSig >>= 1;
1884
        ++zExp;
1885
    }
1886
    zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1887
    if ( ( zSig & 0x3F ) == 0 ) {
1888
        zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1889
    }
1890
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1891

    
1892
}
1893

    
1894
/*----------------------------------------------------------------------------
1895
| Returns the remainder of the single-precision floating-point value `a'
1896
| with respect to the corresponding value `b'.  The operation is performed
1897
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1898
*----------------------------------------------------------------------------*/
1899

    
1900
float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1901
{
1902
    flag aSign, bSign, zSign;
1903
    int16 aExp, bExp, expDiff;
1904
    bits32 aSig, bSig;
1905
    bits32 q;
1906
    bits64 aSig64, bSig64, q64;
1907
    bits32 alternateASig;
1908
    sbits32 sigMean;
1909

    
1910
    aSig = extractFloat32Frac( a );
1911
    aExp = extractFloat32Exp( a );
1912
    aSign = extractFloat32Sign( a );
1913
    bSig = extractFloat32Frac( b );
1914
    bExp = extractFloat32Exp( b );
1915
    bSign = extractFloat32Sign( b );
1916
    if ( aExp == 0xFF ) {
1917
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1918
            return propagateFloat32NaN( a, b STATUS_VAR );
1919
        }
1920
        float_raise( float_flag_invalid STATUS_VAR);
1921
        return float32_default_nan;
1922
    }
1923
    if ( bExp == 0xFF ) {
1924
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1925
        return a;
1926
    }
1927
    if ( bExp == 0 ) {
1928
        if ( bSig == 0 ) {
1929
            float_raise( float_flag_invalid STATUS_VAR);
1930
            return float32_default_nan;
1931
        }
1932
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1933
    }
1934
    if ( aExp == 0 ) {
1935
        if ( aSig == 0 ) return a;
1936
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1937
    }
1938
    expDiff = aExp - bExp;
1939
    aSig |= 0x00800000;
1940
    bSig |= 0x00800000;
1941
    if ( expDiff < 32 ) {
1942
        aSig <<= 8;
1943
        bSig <<= 8;
1944
        if ( expDiff < 0 ) {
1945
            if ( expDiff < -1 ) return a;
1946
            aSig >>= 1;
1947
        }
1948
        q = ( bSig <= aSig );
1949
        if ( q ) aSig -= bSig;
1950
        if ( 0 < expDiff ) {
1951
            q = ( ( (bits64) aSig )<<32 ) / bSig;
1952
            q >>= 32 - expDiff;
1953
            bSig >>= 2;
1954
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1955
        }
1956
        else {
1957
            aSig >>= 2;
1958
            bSig >>= 2;
1959
        }
1960
    }
1961
    else {
1962
        if ( bSig <= aSig ) aSig -= bSig;
1963
        aSig64 = ( (bits64) aSig )<<40;
1964
        bSig64 = ( (bits64) bSig )<<40;
1965
        expDiff -= 64;
1966
        while ( 0 < expDiff ) {
1967
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1968
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1969
            aSig64 = - ( ( bSig * q64 )<<38 );
1970
            expDiff -= 62;
1971
        }
1972
        expDiff += 64;
1973
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1974
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1975
        q = q64>>( 64 - expDiff );
1976
        bSig <<= 6;
1977
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1978
    }
1979
    do {
1980
        alternateASig = aSig;
1981
        ++q;
1982
        aSig -= bSig;
1983
    } while ( 0 <= (sbits32) aSig );
1984
    sigMean = aSig + alternateASig;
1985
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1986
        aSig = alternateASig;
1987
    }
1988
    zSign = ( (sbits32) aSig < 0 );
1989
    if ( zSign ) aSig = - aSig;
1990
    return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
1991

    
1992
}
1993

    
1994
/*----------------------------------------------------------------------------
1995
| Returns the square root of the single-precision floating-point value `a'.
1996
| The operation is performed according to the IEC/IEEE Standard for Binary
1997
| Floating-Point Arithmetic.
1998
*----------------------------------------------------------------------------*/
1999

    
2000
float32 float32_sqrt( float32 a STATUS_PARAM )
2001
{
2002
    flag aSign;
2003
    int16 aExp, zExp;
2004
    bits32 aSig, zSig;
2005
    bits64 rem, term;
2006

    
2007
    aSig = extractFloat32Frac( a );
2008
    aExp = extractFloat32Exp( a );
2009
    aSign = extractFloat32Sign( a );
2010
    if ( aExp == 0xFF ) {
2011
        if ( aSig ) return propagateFloat32NaN( a, 0 STATUS_VAR );
2012
        if ( ! aSign ) return a;
2013
        float_raise( float_flag_invalid STATUS_VAR);
2014
        return float32_default_nan;
2015
    }
2016
    if ( aSign ) {
2017
        if ( ( aExp | aSig ) == 0 ) return a;
2018
        float_raise( float_flag_invalid STATUS_VAR);
2019
        return float32_default_nan;
2020
    }
2021
    if ( aExp == 0 ) {
2022
        if ( aSig == 0 ) return 0;
2023
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2024
    }
2025
    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2026
    aSig = ( aSig | 0x00800000 )<<8;
2027
    zSig = estimateSqrt32( aExp, aSig ) + 2;
2028
    if ( ( zSig & 0x7F ) <= 5 ) {
2029
        if ( zSig < 2 ) {
2030
            zSig = 0x7FFFFFFF;
2031
            goto roundAndPack;
2032
        }
2033
        aSig >>= aExp & 1;
2034
        term = ( (bits64) zSig ) * zSig;
2035
        rem = ( ( (bits64) aSig )<<32 ) - term;
2036
        while ( (sbits64) rem < 0 ) {
2037
            --zSig;
2038
            rem += ( ( (bits64) zSig )<<1 ) | 1;
2039
        }
2040
        zSig |= ( rem != 0 );
2041
    }
2042
    shift32RightJamming( zSig, 1, &zSig );
2043
 roundAndPack:
2044
    return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2045

    
2046
}
2047

    
2048
/*----------------------------------------------------------------------------
2049
| Returns 1 if the single-precision floating-point value `a' is equal to
2050
| the corresponding value `b', and 0 otherwise.  The comparison is performed
2051
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2052
*----------------------------------------------------------------------------*/
2053

    
2054
int float32_eq( float32 a, float32 b STATUS_PARAM )
2055
{
2056

    
2057
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2058
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2059
       ) {
2060
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2061
            float_raise( float_flag_invalid STATUS_VAR);
2062
        }
2063
        return 0;
2064
    }
2065
    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2066

    
2067
}
2068

    
2069
/*----------------------------------------------------------------------------
2070
| Returns 1 if the single-precision floating-point value `a' is less than
2071
| or equal to the corresponding value `b', and 0 otherwise.  The comparison
2072
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2073
| Arithmetic.
2074
*----------------------------------------------------------------------------*/
2075

    
2076
int float32_le( float32 a, float32 b STATUS_PARAM )
2077
{
2078
    flag aSign, bSign;
2079

    
2080
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2081
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2082
       ) {
2083
        float_raise( float_flag_invalid STATUS_VAR);
2084
        return 0;
2085
    }
2086
    aSign = extractFloat32Sign( a );
2087
    bSign = extractFloat32Sign( b );
2088
    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2089
    return ( a == b ) || ( aSign ^ ( a < b ) );
2090

    
2091
}
2092

    
2093
/*----------------------------------------------------------------------------
2094
| Returns 1 if the single-precision floating-point value `a' is less than
2095
| the corresponding value `b', and 0 otherwise.  The comparison is performed
2096
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2097
*----------------------------------------------------------------------------*/
2098

    
2099
int float32_lt( float32 a, float32 b STATUS_PARAM )
2100
{
2101
    flag aSign, bSign;
2102

    
2103
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2104
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2105
       ) {
2106
        float_raise( float_flag_invalid STATUS_VAR);
2107
        return 0;
2108
    }
2109
    aSign = extractFloat32Sign( a );
2110
    bSign = extractFloat32Sign( b );
2111
    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2112
    return ( a != b ) && ( aSign ^ ( a < b ) );
2113

    
2114
}
2115

    
2116
/*----------------------------------------------------------------------------
2117
| Returns 1 if the single-precision floating-point value `a' is equal to
2118
| the corresponding value `b', and 0 otherwise.  The invalid exception is
2119
| raised if either operand is a NaN.  Otherwise, the comparison is performed
2120
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2121
*----------------------------------------------------------------------------*/
2122

    
2123
int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2124
{
2125

    
2126
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2127
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2128
       ) {
2129
        float_raise( float_flag_invalid STATUS_VAR);
2130
        return 0;
2131
    }
2132
    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2133

    
2134
}
2135

    
2136
/*----------------------------------------------------------------------------
2137
| Returns 1 if the single-precision floating-point value `a' is less than or
2138
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2139
| cause an exception.  Otherwise, the comparison is performed according to the
2140
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2141
*----------------------------------------------------------------------------*/
2142

    
2143
int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2144
{
2145
    flag aSign, bSign;
2146

    
2147
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2148
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2149
       ) {
2150
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2151
            float_raise( float_flag_invalid STATUS_VAR);
2152
        }
2153
        return 0;
2154
    }
2155
    aSign = extractFloat32Sign( a );
2156
    bSign = extractFloat32Sign( b );
2157
    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2158
    return ( a == b ) || ( aSign ^ ( a < b ) );
2159

    
2160
}
2161

    
2162
/*----------------------------------------------------------------------------
2163
| Returns 1 if the single-precision floating-point value `a' is less than
2164
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2165
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2166
| Standard for Binary Floating-Point Arithmetic.
2167
*----------------------------------------------------------------------------*/
2168

    
2169
int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2170
{
2171
    flag aSign, bSign;
2172

    
2173
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2174
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2175
       ) {
2176
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2177
            float_raise( float_flag_invalid STATUS_VAR);
2178
        }
2179
        return 0;
2180
    }
2181
    aSign = extractFloat32Sign( a );
2182
    bSign = extractFloat32Sign( b );
2183
    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2184
    return ( a != b ) && ( aSign ^ ( a < b ) );
2185

    
2186
}
2187

    
2188
/*----------------------------------------------------------------------------
2189
| Returns the result of converting the double-precision floating-point value
2190
| `a' to the 32-bit two's complement integer format.  The conversion is
2191
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2192
| Arithmetic---which means in particular that the conversion is rounded
2193
| according to the current rounding mode.  If `a' is a NaN, the largest
2194
| positive integer is returned.  Otherwise, if the conversion overflows, the
2195
| largest integer with the same sign as `a' is returned.
2196
*----------------------------------------------------------------------------*/
2197

    
2198
int32 float64_to_int32( float64 a STATUS_PARAM )
2199
{
2200
    flag aSign;
2201
    int16 aExp, shiftCount;
2202
    bits64 aSig;
2203

    
2204
    aSig = extractFloat64Frac( a );
2205
    aExp = extractFloat64Exp( a );
2206
    aSign = extractFloat64Sign( a );
2207
    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2208
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2209
    shiftCount = 0x42C - aExp;
2210
    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2211
    return roundAndPackInt32( aSign, aSig STATUS_VAR );
2212

    
2213
}
2214

    
2215
/*----------------------------------------------------------------------------
2216
| Returns the result of converting the double-precision floating-point value
2217
| `a' to the 32-bit two's complement integer format.  The conversion is
2218
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2219
| Arithmetic, except that the conversion is always rounded toward zero.
2220
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2221
| the conversion overflows, the largest integer with the same sign as `a' is
2222
| returned.
2223
*----------------------------------------------------------------------------*/
2224

    
2225
int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2226
{
2227
    flag aSign;
2228
    int16 aExp, shiftCount;
2229
    bits64 aSig, savedASig;
2230
    int32 z;
2231

    
2232
    aSig = extractFloat64Frac( a );
2233
    aExp = extractFloat64Exp( a );
2234
    aSign = extractFloat64Sign( a );
2235
    if ( 0x41E < aExp ) {
2236
        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2237
        goto invalid;
2238
    }
2239
    else if ( aExp < 0x3FF ) {
2240
        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2241
        return 0;
2242
    }
2243
    aSig |= LIT64( 0x0010000000000000 );
2244
    shiftCount = 0x433 - aExp;
2245
    savedASig = aSig;
2246
    aSig >>= shiftCount;
2247
    z = aSig;
2248
    if ( aSign ) z = - z;
2249
    if ( ( z < 0 ) ^ aSign ) {
2250
 invalid:
2251
        float_raise( float_flag_invalid STATUS_VAR);
2252
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2253
    }
2254
    if ( ( aSig<<shiftCount ) != savedASig ) {
2255
        STATUS(float_exception_flags) |= float_flag_inexact;
2256
    }
2257
    return z;
2258

    
2259
}
2260

    
2261
/*----------------------------------------------------------------------------
2262
| Returns the result of converting the double-precision floating-point value
2263
| `a' to the 64-bit two's complement integer format.  The conversion is
2264
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2265
| Arithmetic---which means in particular that the conversion is rounded
2266
| according to the current rounding mode.  If `a' is a NaN, the largest
2267
| positive integer is returned.  Otherwise, if the conversion overflows, the
2268
| largest integer with the same sign as `a' is returned.
2269
*----------------------------------------------------------------------------*/
2270

    
2271
int64 float64_to_int64( float64 a STATUS_PARAM )
2272
{
2273
    flag aSign;
2274
    int16 aExp, shiftCount;
2275
    bits64 aSig, aSigExtra;
2276

    
2277
    aSig = extractFloat64Frac( a );
2278
    aExp = extractFloat64Exp( a );
2279
    aSign = extractFloat64Sign( a );
2280
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2281
    shiftCount = 0x433 - aExp;
2282
    if ( shiftCount <= 0 ) {
2283
        if ( 0x43E < aExp ) {
2284
            float_raise( float_flag_invalid STATUS_VAR);
2285
            if (    ! aSign
2286
                 || (    ( aExp == 0x7FF )
2287
                      && ( aSig != LIT64( 0x0010000000000000 ) ) )
2288
               ) {
2289
                return LIT64( 0x7FFFFFFFFFFFFFFF );
2290
            }
2291
            return (sbits64) LIT64( 0x8000000000000000 );
2292
        }
2293
        aSigExtra = 0;
2294
        aSig <<= - shiftCount;
2295
    }
2296
    else {
2297
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2298
    }
2299
    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2300

    
2301
}
2302

    
2303
/*----------------------------------------------------------------------------
2304
| Returns the result of converting the double-precision floating-point value
2305
| `a' to the 64-bit two's complement integer format.  The conversion is
2306
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2307
| Arithmetic, except that the conversion is always rounded toward zero.
2308
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2309
| the conversion overflows, the largest integer with the same sign as `a' is
2310
| returned.
2311
*----------------------------------------------------------------------------*/
2312

    
2313
int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2314
{
2315
    flag aSign;
2316
    int16 aExp, shiftCount;
2317
    bits64 aSig;
2318
    int64 z;
2319

    
2320
    aSig = extractFloat64Frac( a );
2321
    aExp = extractFloat64Exp( a );
2322
    aSign = extractFloat64Sign( a );
2323
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2324
    shiftCount = aExp - 0x433;
2325
    if ( 0 <= shiftCount ) {
2326
        if ( 0x43E <= aExp ) {
2327
            if ( a != LIT64( 0xC3E0000000000000 ) ) {
2328
                float_raise( float_flag_invalid STATUS_VAR);
2329
                if (    ! aSign
2330
                     || (    ( aExp == 0x7FF )
2331
                          && ( aSig != LIT64( 0x0010000000000000 ) ) )
2332
                   ) {
2333
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
2334
                }
2335
            }
2336
            return (sbits64) LIT64( 0x8000000000000000 );
2337
        }
2338
        z = aSig<<shiftCount;
2339
    }
2340
    else {
2341
        if ( aExp < 0x3FE ) {
2342
            if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2343
            return 0;
2344
        }
2345
        z = aSig>>( - shiftCount );
2346
        if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2347
            STATUS(float_exception_flags) |= float_flag_inexact;
2348
        }
2349
    }
2350
    if ( aSign ) z = - z;
2351
    return z;
2352

    
2353
}
2354

    
2355
/*----------------------------------------------------------------------------
2356
| Returns the result of converting the double-precision floating-point value
2357
| `a' to the single-precision floating-point format.  The conversion is
2358
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2359
| Arithmetic.
2360
*----------------------------------------------------------------------------*/
2361

    
2362
float32 float64_to_float32( float64 a STATUS_PARAM )
2363
{
2364
    flag aSign;
2365
    int16 aExp;
2366
    bits64 aSig;
2367
    bits32 zSig;
2368

    
2369
    aSig = extractFloat64Frac( a );
2370
    aExp = extractFloat64Exp( a );
2371
    aSign = extractFloat64Sign( a );
2372
    if ( aExp == 0x7FF ) {
2373
        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2374
        return packFloat32( aSign, 0xFF, 0 );
2375
    }
2376
    shift64RightJamming( aSig, 22, &aSig );
2377
    zSig = aSig;
2378
    if ( aExp || zSig ) {
2379
        zSig |= 0x40000000;
2380
        aExp -= 0x381;
2381
    }
2382
    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2383

    
2384
}
2385

    
2386
#ifdef FLOATX80
2387

    
2388
/*----------------------------------------------------------------------------
2389
| Returns the result of converting the double-precision floating-point value
2390
| `a' to the extended double-precision floating-point format.  The conversion
2391
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2392
| Arithmetic.
2393
*----------------------------------------------------------------------------*/
2394

    
2395
floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2396
{
2397
    flag aSign;
2398
    int16 aExp;
2399
    bits64 aSig;
2400

    
2401
    aSig = extractFloat64Frac( a );
2402
    aExp = extractFloat64Exp( a );
2403
    aSign = extractFloat64Sign( a );
2404
    if ( aExp == 0x7FF ) {
2405
        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2406
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2407
    }
2408
    if ( aExp == 0 ) {
2409
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2410
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2411
    }
2412
    return
2413
        packFloatx80(
2414
            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2415

    
2416
}
2417

    
2418
#endif
2419

    
2420
#ifdef FLOAT128
2421

    
2422
/*----------------------------------------------------------------------------
2423
| Returns the result of converting the double-precision floating-point value
2424
| `a' to the quadruple-precision floating-point format.  The conversion is
2425
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2426
| Arithmetic.
2427
*----------------------------------------------------------------------------*/
2428

    
2429
float128 float64_to_float128( float64 a STATUS_PARAM )
2430
{
2431
    flag aSign;
2432
    int16 aExp;
2433
    bits64 aSig, zSig0, zSig1;
2434

    
2435
    aSig = extractFloat64Frac( a );
2436
    aExp = extractFloat64Exp( a );
2437
    aSign = extractFloat64Sign( a );
2438
    if ( aExp == 0x7FF ) {
2439
        if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2440
        return packFloat128( aSign, 0x7FFF, 0, 0 );
2441
    }
2442
    if ( aExp == 0 ) {
2443
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2444
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2445
        --aExp;
2446
    }
2447
    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2448
    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2449

    
2450
}
2451

    
2452
#endif
2453

    
2454
/*----------------------------------------------------------------------------
2455
| Rounds the double-precision floating-point value `a' to an integer, and
2456
| returns the result as a double-precision floating-point value.  The
2457
| operation is performed according to the IEC/IEEE Standard for Binary
2458
| Floating-Point Arithmetic.
2459
*----------------------------------------------------------------------------*/
2460

    
2461
float64 float64_round_to_int( float64 a STATUS_PARAM )
2462
{
2463
    flag aSign;
2464
    int16 aExp;
2465
    bits64 lastBitMask, roundBitsMask;
2466
    int8 roundingMode;
2467
    float64 z;
2468

    
2469
    aExp = extractFloat64Exp( a );
2470
    if ( 0x433 <= aExp ) {
2471
        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2472
            return propagateFloat64NaN( a, a STATUS_VAR );
2473
        }
2474
        return a;
2475
    }
2476
    if ( aExp < 0x3FF ) {
2477
        if ( (bits64) ( a<<1 ) == 0 ) return a;
2478
        STATUS(float_exception_flags) |= float_flag_inexact;
2479
        aSign = extractFloat64Sign( a );
2480
        switch ( STATUS(float_rounding_mode) ) {
2481
         case float_round_nearest_even:
2482
            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2483
                return packFloat64( aSign, 0x3FF, 0 );
2484
            }
2485
            break;
2486
         case float_round_down:
2487
            return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2488
         case float_round_up:
2489
            return
2490
            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2491
        }
2492
        return packFloat64( aSign, 0, 0 );
2493
    }
2494
    lastBitMask = 1;
2495
    lastBitMask <<= 0x433 - aExp;
2496
    roundBitsMask = lastBitMask - 1;
2497
    z = a;
2498
    roundingMode = STATUS(float_rounding_mode);
2499
    if ( roundingMode == float_round_nearest_even ) {
2500
        z += lastBitMask>>1;
2501
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2502
    }
2503
    else if ( roundingMode != float_round_to_zero ) {
2504
        if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2505
            z += roundBitsMask;
2506
        }
2507
    }
2508
    z &= ~ roundBitsMask;
2509
    if ( z != a ) STATUS(float_exception_flags) |= float_flag_inexact;
2510
    return z;
2511

    
2512
}
2513

    
2514
float64 float64_trunc_to_int( float64 a STATUS_PARAM)
2515
{
2516
    int oldmode;
2517
    float64 res;
2518
    oldmode = STATUS(float_rounding_mode);
2519
    STATUS(float_rounding_mode) = float_round_to_zero;
2520
    res = float64_round_to_int(a STATUS_VAR);
2521
    STATUS(float_rounding_mode) = oldmode;
2522
    return res;
2523
}
2524

    
2525
/*----------------------------------------------------------------------------
2526
| Returns the result of adding the absolute values of the double-precision
2527
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2528
| before being returned.  `zSign' is ignored if the result is a NaN.
2529
| The addition is performed according to the IEC/IEEE Standard for Binary
2530
| Floating-Point Arithmetic.
2531
*----------------------------------------------------------------------------*/
2532

    
2533
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2534
{
2535
    int16 aExp, bExp, zExp;
2536
    bits64 aSig, bSig, zSig;
2537
    int16 expDiff;
2538

    
2539
    aSig = extractFloat64Frac( a );
2540
    aExp = extractFloat64Exp( a );
2541
    bSig = extractFloat64Frac( b );
2542
    bExp = extractFloat64Exp( b );
2543
    expDiff = aExp - bExp;
2544
    aSig <<= 9;
2545
    bSig <<= 9;
2546
    if ( 0 < expDiff ) {
2547
        if ( aExp == 0x7FF ) {
2548
            if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2549
            return a;
2550
        }
2551
        if ( bExp == 0 ) {
2552
            --expDiff;
2553
        }
2554
        else {
2555
            bSig |= LIT64( 0x2000000000000000 );
2556
        }
2557
        shift64RightJamming( bSig, expDiff, &bSig );
2558
        zExp = aExp;
2559
    }
2560
    else if ( expDiff < 0 ) {
2561
        if ( bExp == 0x7FF ) {
2562
            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2563
            return packFloat64( zSign, 0x7FF, 0 );
2564
        }
2565
        if ( aExp == 0 ) {
2566
            ++expDiff;
2567
        }
2568
        else {
2569
            aSig |= LIT64( 0x2000000000000000 );
2570
        }
2571
        shift64RightJamming( aSig, - expDiff, &aSig );
2572
        zExp = bExp;
2573
    }
2574
    else {
2575
        if ( aExp == 0x7FF ) {
2576
            if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2577
            return a;
2578
        }
2579
        if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2580
        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2581
        zExp = aExp;
2582
        goto roundAndPack;
2583
    }
2584
    aSig |= LIT64( 0x2000000000000000 );
2585
    zSig = ( aSig + bSig )<<1;
2586
    --zExp;
2587
    if ( (sbits64) zSig < 0 ) {
2588
        zSig = aSig + bSig;
2589
        ++zExp;
2590
    }
2591
 roundAndPack:
2592
    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2593

    
2594
}
2595

    
2596
/*----------------------------------------------------------------------------
2597
| Returns the result of subtracting the absolute values of the double-
2598
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
2599
| difference is negated before being returned.  `zSign' is ignored if the
2600
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
2601
| Standard for Binary Floating-Point Arithmetic.
2602
*----------------------------------------------------------------------------*/
2603

    
2604
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2605
{
2606
    int16 aExp, bExp, zExp;
2607
    bits64 aSig, bSig, zSig;
2608
    int16 expDiff;
2609

    
2610
    aSig = extractFloat64Frac( a );
2611
    aExp = extractFloat64Exp( a );
2612
    bSig = extractFloat64Frac( b );
2613
    bExp = extractFloat64Exp( b );
2614
    expDiff = aExp - bExp;
2615
    aSig <<= 10;
2616
    bSig <<= 10;
2617
    if ( 0 < expDiff ) goto aExpBigger;
2618
    if ( expDiff < 0 ) goto bExpBigger;
2619
    if ( aExp == 0x7FF ) {
2620
        if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2621
        float_raise( float_flag_invalid STATUS_VAR);
2622
        return float64_default_nan;
2623
    }
2624
    if ( aExp == 0 ) {
2625
        aExp = 1;
2626
        bExp = 1;
2627
    }
2628
    if ( bSig < aSig ) goto aBigger;
2629
    if ( aSig < bSig ) goto bBigger;
2630
    return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2631
 bExpBigger:
2632
    if ( bExp == 0x7FF ) {
2633
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2634
        return packFloat64( zSign ^ 1, 0x7FF, 0 );
2635
    }
2636
    if ( aExp == 0 ) {
2637
        ++expDiff;
2638
    }
2639
    else {
2640
        aSig |= LIT64( 0x4000000000000000 );
2641
    }
2642
    shift64RightJamming( aSig, - expDiff, &aSig );
2643
    bSig |= LIT64( 0x4000000000000000 );
2644
 bBigger:
2645
    zSig = bSig - aSig;
2646
    zExp = bExp;
2647
    zSign ^= 1;
2648
    goto normalizeRoundAndPack;
2649
 aExpBigger:
2650
    if ( aExp == 0x7FF ) {
2651
        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2652
        return a;
2653
    }
2654
    if ( bExp == 0 ) {
2655
        --expDiff;
2656
    }
2657
    else {
2658
        bSig |= LIT64( 0x4000000000000000 );
2659
    }
2660
    shift64RightJamming( bSig, expDiff, &bSig );
2661
    aSig |= LIT64( 0x4000000000000000 );
2662
 aBigger:
2663
    zSig = aSig - bSig;
2664
    zExp = aExp;
2665
 normalizeRoundAndPack:
2666
    --zExp;
2667
    return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2668

    
2669
}
2670

    
2671
/*----------------------------------------------------------------------------
2672
| Returns the result of adding the double-precision floating-point values `a'
2673
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
2674
| Binary Floating-Point Arithmetic.
2675
*----------------------------------------------------------------------------*/
2676

    
2677
float64 float64_add( float64 a, float64 b STATUS_PARAM )
2678
{
2679
    flag aSign, bSign;
2680

    
2681
    aSign = extractFloat64Sign( a );
2682
    bSign = extractFloat64Sign( b );
2683
    if ( aSign == bSign ) {
2684
        return addFloat64Sigs( a, b, aSign STATUS_VAR );
2685
    }
2686
    else {
2687
        return subFloat64Sigs( a, b, aSign STATUS_VAR );
2688
    }
2689

    
2690
}
2691

    
2692
/*----------------------------------------------------------------------------
2693
| Returns the result of subtracting the double-precision floating-point values
2694
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2695
| for Binary Floating-Point Arithmetic.
2696
*----------------------------------------------------------------------------*/
2697

    
2698
float64 float64_sub( float64 a, float64 b STATUS_PARAM )
2699
{
2700
    flag aSign, bSign;
2701

    
2702
    aSign = extractFloat64Sign( a );
2703
    bSign = extractFloat64Sign( b );
2704
    if ( aSign == bSign ) {
2705
        return subFloat64Sigs( a, b, aSign STATUS_VAR );
2706
    }
2707
    else {
2708
        return addFloat64Sigs( a, b, aSign STATUS_VAR );
2709
    }
2710

    
2711
}
2712

    
2713
/*----------------------------------------------------------------------------
2714
| Returns the result of multiplying the double-precision floating-point values
2715
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2716
| for Binary Floating-Point Arithmetic.
2717
*----------------------------------------------------------------------------*/
2718

    
2719
float64 float64_mul( float64 a, float64 b STATUS_PARAM )
2720
{
2721
    flag aSign, bSign, zSign;
2722
    int16 aExp, bExp, zExp;
2723
    bits64 aSig, bSig, zSig0, zSig1;
2724

    
2725
    aSig = extractFloat64Frac( a );
2726
    aExp = extractFloat64Exp( a );
2727
    aSign = extractFloat64Sign( a );
2728
    bSig = extractFloat64Frac( b );
2729
    bExp = extractFloat64Exp( b );
2730
    bSign = extractFloat64Sign( b );
2731
    zSign = aSign ^ bSign;
2732
    if ( aExp == 0x7FF ) {
2733
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2734
            return propagateFloat64NaN( a, b STATUS_VAR );
2735
        }
2736
        if ( ( bExp | bSig ) == 0 ) {
2737
            float_raise( float_flag_invalid STATUS_VAR);
2738
            return float64_default_nan;
2739
        }
2740
        return packFloat64( zSign, 0x7FF, 0 );
2741
    }
2742
    if ( bExp == 0x7FF ) {
2743
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2744
        if ( ( aExp | aSig ) == 0 ) {
2745
            float_raise( float_flag_invalid STATUS_VAR);
2746
            return float64_default_nan;
2747
        }
2748
        return packFloat64( zSign, 0x7FF, 0 );
2749
    }
2750
    if ( aExp == 0 ) {
2751
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2752
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2753
    }
2754
    if ( bExp == 0 ) {
2755
        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2756
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2757
    }
2758
    zExp = aExp + bExp - 0x3FF;
2759
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2760
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2761
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
2762
    zSig0 |= ( zSig1 != 0 );
2763
    if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2764
        zSig0 <<= 1;
2765
        --zExp;
2766
    }
2767
    return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
2768

    
2769
}
2770

    
2771
/*----------------------------------------------------------------------------
2772
| Returns the result of dividing the double-precision floating-point value `a'
2773
| by the corresponding value `b'.  The operation is performed according to
2774
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2775
*----------------------------------------------------------------------------*/
2776

    
2777
float64 float64_div( float64 a, float64 b STATUS_PARAM )
2778
{
2779
    flag aSign, bSign, zSign;
2780
    int16 aExp, bExp, zExp;
2781
    bits64 aSig, bSig, zSig;
2782
    bits64 rem0, rem1;
2783
    bits64 term0, term1;
2784

    
2785
    aSig = extractFloat64Frac( a );
2786
    aExp = extractFloat64Exp( a );
2787
    aSign = extractFloat64Sign( a );
2788
    bSig = extractFloat64Frac( b );
2789
    bExp = extractFloat64Exp( b );
2790
    bSign = extractFloat64Sign( b );
2791
    zSign = aSign ^ bSign;
2792
    if ( aExp == 0x7FF ) {
2793
        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2794
        if ( bExp == 0x7FF ) {
2795
            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2796
            float_raise( float_flag_invalid STATUS_VAR);
2797
            return float64_default_nan;
2798
        }
2799
        return packFloat64( zSign, 0x7FF, 0 );
2800
    }
2801
    if ( bExp == 0x7FF ) {
2802
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2803
        return packFloat64( zSign, 0, 0 );
2804
    }
2805
    if ( bExp == 0 ) {
2806
        if ( bSig == 0 ) {
2807
            if ( ( aExp | aSig ) == 0 ) {
2808
                float_raise( float_flag_invalid STATUS_VAR);
2809
                return float64_default_nan;
2810
            }
2811
            float_raise( float_flag_divbyzero STATUS_VAR);
2812
            return packFloat64( zSign, 0x7FF, 0 );
2813
        }
2814
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2815
    }
2816
    if ( aExp == 0 ) {
2817
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2818
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2819
    }
2820
    zExp = aExp - bExp + 0x3FD;
2821
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2822
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2823
    if ( bSig <= ( aSig + aSig ) ) {
2824
        aSig >>= 1;
2825
        ++zExp;
2826
    }
2827
    zSig = estimateDiv128To64( aSig, 0, bSig );
2828
    if ( ( zSig & 0x1FF ) <= 2 ) {
2829
        mul64To128( bSig, zSig, &term0, &term1 );
2830
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2831
        while ( (sbits64) rem0 < 0 ) {
2832
            --zSig;
2833
            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2834
        }
2835
        zSig |= ( rem1 != 0 );
2836
    }
2837
    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2838

    
2839
}
2840

    
2841
/*----------------------------------------------------------------------------
2842
| Returns the remainder of the double-precision floating-point value `a'
2843
| with respect to the corresponding value `b'.  The operation is performed
2844
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2845
*----------------------------------------------------------------------------*/
2846

    
2847
float64 float64_rem( float64 a, float64 b STATUS_PARAM )
2848
{
2849
    flag aSign, bSign, zSign;
2850
    int16 aExp, bExp, expDiff;
2851
    bits64 aSig, bSig;
2852
    bits64 q, alternateASig;
2853
    sbits64 sigMean;
2854

    
2855
    aSig = extractFloat64Frac( a );
2856
    aExp = extractFloat64Exp( a );
2857
    aSign = extractFloat64Sign( a );
2858
    bSig = extractFloat64Frac( b );
2859
    bExp = extractFloat64Exp( b );
2860
    bSign = extractFloat64Sign( b );
2861
    if ( aExp == 0x7FF ) {
2862
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2863
            return propagateFloat64NaN( a, b STATUS_VAR );
2864
        }
2865
        float_raise( float_flag_invalid STATUS_VAR);
2866
        return float64_default_nan;
2867
    }
2868
    if ( bExp == 0x7FF ) {
2869
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2870
        return a;
2871
    }
2872
    if ( bExp == 0 ) {
2873
        if ( bSig == 0 ) {
2874
            float_raise( float_flag_invalid STATUS_VAR);
2875
            return float64_default_nan;
2876
        }
2877
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2878
    }
2879
    if ( aExp == 0 ) {
2880
        if ( aSig == 0 ) return a;
2881
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2882
    }
2883
    expDiff = aExp - bExp;
2884
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2885
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2886
    if ( expDiff < 0 ) {
2887
        if ( expDiff < -1 ) return a;
2888
        aSig >>= 1;
2889
    }
2890
    q = ( bSig <= aSig );
2891
    if ( q ) aSig -= bSig;
2892
    expDiff -= 64;
2893
    while ( 0 < expDiff ) {
2894
        q = estimateDiv128To64( aSig, 0, bSig );
2895
        q = ( 2 < q ) ? q - 2 : 0;
2896
        aSig = - ( ( bSig>>2 ) * q );
2897
        expDiff -= 62;
2898
    }
2899
    expDiff += 64;
2900
    if ( 0 < expDiff ) {
2901
        q = estimateDiv128To64( aSig, 0, bSig );
2902
        q = ( 2 < q ) ? q - 2 : 0;
2903
        q >>= 64 - expDiff;
2904
        bSig >>= 2;
2905
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2906
    }
2907
    else {
2908
        aSig >>= 2;
2909
        bSig >>= 2;
2910
    }
2911
    do {
2912
        alternateASig = aSig;
2913
        ++q;
2914
        aSig -= bSig;
2915
    } while ( 0 <= (sbits64) aSig );
2916
    sigMean = aSig + alternateASig;
2917
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2918
        aSig = alternateASig;
2919
    }
2920
    zSign = ( (sbits64) aSig < 0 );
2921
    if ( zSign ) aSig = - aSig;
2922
    return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
2923

    
2924
}
2925

    
2926
/*----------------------------------------------------------------------------
2927
| Returns the square root of the double-precision floating-point value `a'.
2928
| The operation is performed according to the IEC/IEEE Standard for Binary
2929
| Floating-Point Arithmetic.
2930
*----------------------------------------------------------------------------*/
2931

    
2932
float64 float64_sqrt( float64 a STATUS_PARAM )
2933
{
2934
    flag aSign;
2935
    int16 aExp, zExp;
2936
    bits64 aSig, zSig, doubleZSig;
2937
    bits64 rem0, rem1, term0, term1;
2938

    
2939
    aSig = extractFloat64Frac( a );
2940
    aExp = extractFloat64Exp( a );
2941
    aSign = extractFloat64Sign( a );
2942
    if ( aExp == 0x7FF ) {
2943
        if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
2944
        if ( ! aSign ) return a;
2945
        float_raise( float_flag_invalid STATUS_VAR);
2946
        return float64_default_nan;
2947
    }
2948
    if ( aSign ) {
2949
        if ( ( aExp | aSig ) == 0 ) return a;
2950
        float_raise( float_flag_invalid STATUS_VAR);
2951
        return float64_default_nan;
2952
    }
2953
    if ( aExp == 0 ) {
2954
        if ( aSig == 0 ) return 0;
2955
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2956
    }
2957
    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2958
    aSig |= LIT64( 0x0010000000000000 );
2959
    zSig = estimateSqrt32( aExp, aSig>>21 );
2960
    aSig <<= 9 - ( aExp & 1 );
2961
    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
2962
    if ( ( zSig & 0x1FF ) <= 5 ) {
2963
        doubleZSig = zSig<<1;
2964
        mul64To128( zSig, zSig, &term0, &term1 );
2965
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2966
        while ( (sbits64) rem0 < 0 ) {
2967
            --zSig;
2968
            doubleZSig -= 2;
2969
            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
2970
        }
2971
        zSig |= ( ( rem0 | rem1 ) != 0 );
2972
    }
2973
    return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
2974

    
2975
}
2976

    
2977
/*----------------------------------------------------------------------------
2978
| Returns 1 if the double-precision floating-point value `a' is equal to the
2979
| corresponding value `b', and 0 otherwise.  The comparison is performed
2980
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2981
*----------------------------------------------------------------------------*/
2982

    
2983
int float64_eq( float64 a, float64 b STATUS_PARAM )
2984
{
2985

    
2986
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2987
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2988
       ) {
2989
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2990
            float_raise( float_flag_invalid STATUS_VAR);
2991
        }
2992
        return 0;
2993
    }
2994
    return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2995

    
2996
}
2997

    
2998
/*----------------------------------------------------------------------------
2999
| Returns 1 if the double-precision floating-point value `a' is less than or
3000
| equal to the corresponding value `b', and 0 otherwise.  The comparison is
3001
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3002
| Arithmetic.
3003
*----------------------------------------------------------------------------*/
3004

    
3005
int float64_le( float64 a, float64 b STATUS_PARAM )
3006
{
3007
    flag aSign, bSign;
3008

    
3009
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3010
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3011
       ) {
3012
        float_raise( float_flag_invalid STATUS_VAR);
3013
        return 0;
3014
    }
3015
    aSign = extractFloat64Sign( a );
3016
    bSign = extractFloat64Sign( b );
3017
    if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3018
    return ( a == b ) || ( aSign ^ ( a < b ) );
3019

    
3020
}
3021

    
3022
/*----------------------------------------------------------------------------
3023
| Returns 1 if the double-precision floating-point value `a' is less than
3024
| the corresponding value `b', and 0 otherwise.  The comparison is performed
3025
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3026
*----------------------------------------------------------------------------*/
3027

    
3028
int float64_lt( float64 a, float64 b STATUS_PARAM )
3029
{
3030
    flag aSign, bSign;
3031

    
3032
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3033
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3034
       ) {
3035
        float_raise( float_flag_invalid STATUS_VAR);
3036
        return 0;
3037
    }
3038
    aSign = extractFloat64Sign( a );
3039
    bSign = extractFloat64Sign( b );
3040
    if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3041
    return ( a != b ) && ( aSign ^ ( a < b ) );
3042

    
3043
}
3044

    
3045
/*----------------------------------------------------------------------------
3046
| Returns 1 if the double-precision floating-point value `a' is equal to the
3047
| corresponding value `b', and 0 otherwise.  The invalid exception is raised
3048
| if either operand is a NaN.  Otherwise, the comparison is performed
3049
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3050
*----------------------------------------------------------------------------*/
3051

    
3052
int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3053
{
3054

    
3055
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3056
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3057
       ) {
3058
        float_raise( float_flag_invalid STATUS_VAR);
3059
        return 0;
3060
    }
3061
    return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3062

    
3063
}
3064

    
3065
/*----------------------------------------------------------------------------
3066
| Returns 1 if the double-precision floating-point value `a' is less than or
3067
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3068
| cause an exception.  Otherwise, the comparison is performed according to the
3069
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3070
*----------------------------------------------------------------------------*/
3071

    
3072
int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3073
{
3074
    flag aSign, bSign;
3075

    
3076
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3077
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3078
       ) {
3079
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3080
            float_raise( float_flag_invalid STATUS_VAR);
3081
        }
3082
        return 0;
3083
    }
3084
    aSign = extractFloat64Sign( a );
3085
    bSign = extractFloat64Sign( b );
3086
    if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3087
    return ( a == b ) || ( aSign ^ ( a < b ) );
3088

    
3089
}
3090

    
3091
/*----------------------------------------------------------------------------
3092
| Returns 1 if the double-precision floating-point value `a' is less than
3093
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3094
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3095
| Standard for Binary Floating-Point Arithmetic.
3096
*----------------------------------------------------------------------------*/
3097

    
3098
int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3099
{
3100
    flag aSign, bSign;
3101

    
3102
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3103
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3104
       ) {
3105
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3106
            float_raise( float_flag_invalid STATUS_VAR);
3107
        }
3108
        return 0;
3109
    }
3110
    aSign = extractFloat64Sign( a );
3111
    bSign = extractFloat64Sign( b );
3112
    if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3113
    return ( a != b ) && ( aSign ^ ( a < b ) );
3114

    
3115
}
3116

    
3117
#ifdef FLOATX80
3118

    
3119
/*----------------------------------------------------------------------------
3120
| Returns the result of converting the extended double-precision floating-
3121
| point value `a' to the 32-bit two's complement integer format.  The
3122
| conversion is performed according to the IEC/IEEE Standard for Binary
3123
| Floating-Point Arithmetic---which means in particular that the conversion
3124
| is rounded according to the current rounding mode.  If `a' is a NaN, the
3125
| largest positive integer is returned.  Otherwise, if the conversion
3126
| overflows, the largest integer with the same sign as `a' is returned.
3127
*----------------------------------------------------------------------------*/
3128

    
3129
int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3130
{
3131
    flag aSign;
3132
    int32 aExp, shiftCount;
3133
    bits64 aSig;
3134

    
3135
    aSig = extractFloatx80Frac( a );
3136
    aExp = extractFloatx80Exp( a );
3137
    aSign = extractFloatx80Sign( a );
3138
    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3139
    shiftCount = 0x4037 - aExp;
3140
    if ( shiftCount <= 0 ) shiftCount = 1;
3141
    shift64RightJamming( aSig, shiftCount, &aSig );
3142
    return roundAndPackInt32( aSign, aSig STATUS_VAR );
3143

    
3144
}
3145

    
3146
/*----------------------------------------------------------------------------
3147
| Returns the result of converting the extended double-precision floating-
3148
| point value `a' to the 32-bit two's complement integer format.  The
3149
| conversion is performed according to the IEC/IEEE Standard for Binary
3150
| Floating-Point Arithmetic, except that the conversion is always rounded
3151
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3152
| Otherwise, if the conversion overflows, the largest integer with the same
3153
| sign as `a' is returned.
3154
*----------------------------------------------------------------------------*/
3155

    
3156
int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3157
{
3158
    flag aSign;
3159
    int32 aExp, shiftCount;
3160
    bits64 aSig, savedASig;
3161
    int32 z;
3162

    
3163
    aSig = extractFloatx80Frac( a );
3164
    aExp = extractFloatx80Exp( a );
3165
    aSign = extractFloatx80Sign( a );
3166
    if ( 0x401E < aExp ) {
3167
        if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3168
        goto invalid;
3169
    }
3170
    else if ( aExp < 0x3FFF ) {
3171
        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3172
        return 0;
3173
    }
3174
    shiftCount = 0x403E - aExp;
3175
    savedASig = aSig;
3176
    aSig >>= shiftCount;
3177
    z = aSig;
3178
    if ( aSign ) z = - z;
3179
    if ( ( z < 0 ) ^ aSign ) {
3180
 invalid:
3181
        float_raise( float_flag_invalid STATUS_VAR);
3182
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3183
    }
3184
    if ( ( aSig<<shiftCount ) != savedASig ) {
3185
        STATUS(float_exception_flags) |= float_flag_inexact;
3186
    }
3187
    return z;
3188

    
3189
}
3190

    
3191
/*----------------------------------------------------------------------------
3192
| Returns the result of converting the extended double-precision floating-
3193
| point value `a' to the 64-bit two's complement integer format.  The
3194
| conversion is performed according to the IEC/IEEE Standard for Binary
3195
| Floating-Point Arithmetic---which means in particular that the conversion
3196
| is rounded according to the current rounding mode.  If `a' is a NaN,
3197
| the largest positive integer is returned.  Otherwise, if the conversion
3198
| overflows, the largest integer with the same sign as `a' is returned.
3199
*----------------------------------------------------------------------------*/
3200

    
3201
int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3202
{
3203
    flag aSign;
3204
    int32 aExp, shiftCount;
3205
    bits64 aSig, aSigExtra;
3206

    
3207
    aSig = extractFloatx80Frac( a );
3208
    aExp = extractFloatx80Exp( a );
3209
    aSign = extractFloatx80Sign( a );
3210
    shiftCount = 0x403E - aExp;
3211
    if ( shiftCount <= 0 ) {
3212
        if ( shiftCount ) {
3213
            float_raise( float_flag_invalid STATUS_VAR);
3214
            if (    ! aSign
3215
                 || (    ( aExp == 0x7FFF )
3216
                      && ( aSig != LIT64( 0x8000000000000000 ) ) )
3217
               ) {
3218
                return LIT64( 0x7FFFFFFFFFFFFFFF );
3219
            }
3220
            return (sbits64) LIT64( 0x8000000000000000 );
3221
        }
3222
        aSigExtra = 0;
3223
    }
3224
    else {
3225
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3226
    }
3227
    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3228

    
3229
}
3230

    
3231
/*----------------------------------------------------------------------------
3232
| Returns the result of converting the extended double-precision floating-
3233
| point value `a' to the 64-bit two's complement integer format.  The
3234
| conversion is performed according to the IEC/IEEE Standard for Binary
3235
| Floating-Point Arithmetic, except that the conversion is always rounded
3236
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3237
| Otherwise, if the conversion overflows, the largest integer with the same
3238
| sign as `a' is returned.
3239
*----------------------------------------------------------------------------*/
3240

    
3241
int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3242
{
3243
    flag aSign;
3244
    int32 aExp, shiftCount;
3245
    bits64 aSig;
3246
    int64 z;
3247

    
3248
    aSig = extractFloatx80Frac( a );
3249
    aExp = extractFloatx80Exp( a );
3250
    aSign = extractFloatx80Sign( a );
3251
    shiftCount = aExp - 0x403E;
3252
    if ( 0 <= shiftCount ) {
3253
        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3254
        if ( ( a.high != 0xC03E ) || aSig ) {
3255
            float_raise( float_flag_invalid STATUS_VAR);
3256
            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3257
                return LIT64( 0x7FFFFFFFFFFFFFFF );
3258
            }
3259
        }
3260
        return (sbits64) LIT64( 0x8000000000000000 );
3261
    }
3262
    else if ( aExp < 0x3FFF ) {
3263
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3264
        return 0;
3265
    }
3266
    z = aSig>>( - shiftCount );
3267
    if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3268
        STATUS(float_exception_flags) |= float_flag_inexact;
3269
    }
3270
    if ( aSign ) z = - z;
3271
    return z;
3272

    
3273
}
3274

    
3275
/*----------------------------------------------------------------------------
3276
| Returns the result of converting the extended double-precision floating-
3277
| point value `a' to the single-precision floating-point format.  The
3278
| conversion is performed according to the IEC/IEEE Standard for Binary
3279
| Floating-Point Arithmetic.
3280
*----------------------------------------------------------------------------*/
3281

    
3282
float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3283
{
3284
    flag aSign;
3285
    int32 aExp;
3286
    bits64 aSig;
3287

    
3288
    aSig = extractFloatx80Frac( a );
3289
    aExp = extractFloatx80Exp( a );
3290
    aSign = extractFloatx80Sign( a );
3291
    if ( aExp == 0x7FFF ) {
3292
        if ( (bits64) ( aSig<<1 ) ) {
3293
            return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3294
        }
3295
        return packFloat32( aSign, 0xFF, 0 );
3296
    }
3297
    shift64RightJamming( aSig, 33, &aSig );
3298
    if ( aExp || aSig ) aExp -= 0x3F81;
3299
    return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3300

    
3301
}
3302

    
3303
/*----------------------------------------------------------------------------
3304
| Returns the result of converting the extended double-precision floating-
3305
| point value `a' to the double-precision floating-point format.  The
3306
| conversion is performed according to the IEC/IEEE Standard for Binary
3307
| Floating-Point Arithmetic.
3308
*----------------------------------------------------------------------------*/
3309

    
3310
float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3311
{
3312
    flag aSign;
3313
    int32 aExp;
3314
    bits64 aSig, zSig;
3315

    
3316
    aSig = extractFloatx80Frac( a );
3317
    aExp = extractFloatx80Exp( a );
3318
    aSign = extractFloatx80Sign( a );
3319
    if ( aExp == 0x7FFF ) {
3320
        if ( (bits64) ( aSig<<1 ) ) {
3321
            return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3322
        }
3323
        return packFloat64( aSign, 0x7FF, 0 );
3324
    }
3325
    shift64RightJamming( aSig, 1, &zSig );
3326
    if ( aExp || aSig ) aExp -= 0x3C01;
3327
    return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3328

    
3329
}
3330

    
3331
#ifdef FLOAT128
3332

    
3333
/*----------------------------------------------------------------------------
3334
| Returns the result of converting the extended double-precision floating-
3335
| point value `a' to the quadruple-precision floating-point format.  The
3336
| conversion is performed according to the IEC/IEEE Standard for Binary
3337
| Floating-Point Arithmetic.
3338
*----------------------------------------------------------------------------*/
3339

    
3340
float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3341
{
3342
    flag aSign;
3343
    int16 aExp;
3344
    bits64 aSig, zSig0, zSig1;
3345

    
3346
    aSig = extractFloatx80Frac( a );
3347
    aExp = extractFloatx80Exp( a );
3348
    aSign = extractFloatx80Sign( a );
3349
    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3350
        return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3351
    }
3352
    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3353
    return packFloat128( aSign, aExp, zSig0, zSig1 );
3354

    
3355
}
3356

    
3357
#endif
3358

    
3359
/*----------------------------------------------------------------------------
3360
| Rounds the extended double-precision floating-point value `a' to an integer,
3361
| and returns the result as an extended quadruple-precision floating-point
3362
| value.  The operation is performed according to the IEC/IEEE Standard for
3363
| Binary Floating-Point Arithmetic.
3364
*----------------------------------------------------------------------------*/
3365

    
3366
floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3367
{
3368
    flag aSign;
3369
    int32 aExp;
3370
    bits64 lastBitMask, roundBitsMask;
3371
    int8 roundingMode;
3372
    floatx80 z;
3373

    
3374
    aExp = extractFloatx80Exp( a );
3375
    if ( 0x403E <= aExp ) {
3376
        if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3377
            return propagateFloatx80NaN( a, a STATUS_VAR );
3378
        }
3379
        return a;
3380
    }
3381
    if ( aExp < 0x3FFF ) {
3382
        if (    ( aExp == 0 )
3383
             && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3384
            return a;
3385
        }
3386
        STATUS(float_exception_flags) |= float_flag_inexact;
3387
        aSign = extractFloatx80Sign( a );
3388
        switch ( STATUS(float_rounding_mode) ) {
3389
         case float_round_nearest_even:
3390
            if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3391
               ) {
3392
                return
3393
                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3394
            }
3395
            break;
3396
         case float_round_down:
3397
            return
3398
                  aSign ?
3399
                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3400
                : packFloatx80( 0, 0, 0 );
3401
         case float_round_up:
3402
            return
3403
                  aSign ? packFloatx80( 1, 0, 0 )
3404
                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3405
        }
3406
        return packFloatx80( aSign, 0, 0 );
3407
    }
3408
    lastBitMask = 1;
3409
    lastBitMask <<= 0x403E - aExp;
3410
    roundBitsMask = lastBitMask - 1;
3411
    z = a;
3412
    roundingMode = STATUS(float_rounding_mode);
3413
    if ( roundingMode == float_round_nearest_even ) {
3414
        z.low += lastBitMask>>1;
3415
        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3416
    }
3417
    else if ( roundingMode != float_round_to_zero ) {
3418
        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3419
            z.low += roundBitsMask;
3420
        }
3421
    }
3422
    z.low &= ~ roundBitsMask;
3423
    if ( z.low == 0 ) {
3424
        ++z.high;
3425
        z.low = LIT64( 0x8000000000000000 );
3426
    }
3427
    if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
3428
    return z;
3429

    
3430
}
3431

    
3432
/*----------------------------------------------------------------------------
3433
| Returns the result of adding the absolute values of the extended double-
3434
| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3435
| negated before being returned.  `zSign' is ignored if the result is a NaN.
3436
| The addition is performed according to the IEC/IEEE Standard for Binary
3437
| Floating-Point Arithmetic.
3438
*----------------------------------------------------------------------------*/
3439

    
3440
static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3441
{
3442
    int32 aExp, bExp, zExp;
3443
    bits64 aSig, bSig, zSig0, zSig1;
3444
    int32 expDiff;
3445

    
3446
    aSig = extractFloatx80Frac( a );
3447
    aExp = extractFloatx80Exp( a );
3448
    bSig = extractFloatx80Frac( b );
3449
    bExp = extractFloatx80Exp( b );
3450
    expDiff = aExp - bExp;
3451
    if ( 0 < expDiff ) {
3452
        if ( aExp == 0x7FFF ) {
3453
            if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3454
            return a;
3455
        }
3456
        if ( bExp == 0 ) --expDiff;
3457
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3458
        zExp = aExp;
3459
    }
3460
    else if ( expDiff < 0 ) {
3461
        if ( bExp == 0x7FFF ) {
3462
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3463
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3464
        }
3465
        if ( aExp == 0 ) ++expDiff;
3466
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3467
        zExp = bExp;
3468
    }
3469
    else {
3470
        if ( aExp == 0x7FFF ) {
3471
            if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3472
                return propagateFloatx80NaN( a, b STATUS_VAR );
3473
            }
3474
            return a;
3475
        }
3476
        zSig1 = 0;
3477
        zSig0 = aSig + bSig;
3478
        if ( aExp == 0 ) {
3479
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3480
            goto roundAndPack;
3481
        }
3482
        zExp = aExp;
3483
        goto shiftRight1;
3484
    }
3485
    zSig0 = aSig + bSig;
3486
    if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3487
 shiftRight1:
3488
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3489
    zSig0 |= LIT64( 0x8000000000000000 );
3490
    ++zExp;
3491
 roundAndPack:
3492
    return
3493
        roundAndPackFloatx80(
3494
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3495

    
3496
}
3497

    
3498
/*----------------------------------------------------------------------------
3499
| Returns the result of subtracting the absolute values of the extended
3500
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3501
| difference is negated before being returned.  `zSign' is ignored if the
3502
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
3503
| Standard for Binary Floating-Point Arithmetic.
3504
*----------------------------------------------------------------------------*/
3505

    
3506
static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
3507
{
3508
    int32 aExp, bExp, zExp;
3509
    bits64 aSig, bSig, zSig0, zSig1;
3510
    int32 expDiff;
3511
    floatx80 z;
3512

    
3513
    aSig = extractFloatx80Frac( a );
3514
    aExp = extractFloatx80Exp( a );
3515
    bSig = extractFloatx80Frac( b );
3516
    bExp = extractFloatx80Exp( b );
3517
    expDiff = aExp - bExp;
3518
    if ( 0 < expDiff ) goto aExpBigger;
3519
    if ( expDiff < 0 ) goto bExpBigger;
3520
    if ( aExp == 0x7FFF ) {
3521
        if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3522
            return propagateFloatx80NaN( a, b STATUS_VAR );
3523
        }
3524
        float_raise( float_flag_invalid STATUS_VAR);
3525
        z.low = floatx80_default_nan_low;
3526
        z.high = floatx80_default_nan_high;
3527
        return z;
3528
    }
3529
    if ( aExp == 0 ) {
3530
        aExp = 1;
3531
        bExp = 1;
3532
    }
3533
    zSig1 = 0;
3534
    if ( bSig < aSig ) goto aBigger;
3535
    if ( aSig < bSig ) goto bBigger;
3536
    return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3537
 bExpBigger:
3538
    if ( bExp == 0x7FFF ) {
3539
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3540
        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3541
    }
3542
    if ( aExp == 0 ) ++expDiff;
3543
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3544
 bBigger:
3545
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3546
    zExp = bExp;
3547
    zSign ^= 1;
3548
    goto normalizeRoundAndPack;
3549
 aExpBigger:
3550
    if ( aExp == 0x7FFF ) {
3551
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3552
        return a;
3553
    }
3554
    if ( bExp == 0 ) --expDiff;
3555
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3556
 aBigger:
3557
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3558
    zExp = aExp;
3559
 normalizeRoundAndPack:
3560
    return
3561
        normalizeRoundAndPackFloatx80(
3562
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3563

    
3564
}
3565

    
3566
/*----------------------------------------------------------------------------
3567
| Returns the result of adding the extended double-precision floating-point
3568
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
3569
| Standard for Binary Floating-Point Arithmetic.
3570
*----------------------------------------------------------------------------*/
3571

    
3572
floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
3573
{
3574
    flag aSign, bSign;
3575

    
3576
    aSign = extractFloatx80Sign( a );
3577
    bSign = extractFloatx80Sign( b );
3578
    if ( aSign == bSign ) {
3579
        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3580
    }
3581
    else {
3582
        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3583
    }
3584

    
3585
}
3586

    
3587
/*----------------------------------------------------------------------------
3588
| Returns the result of subtracting the extended double-precision floating-
3589
| point values `a' and `b'.  The operation is performed according to the
3590
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3591
*----------------------------------------------------------------------------*/
3592

    
3593
floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
3594
{
3595
    flag aSign, bSign;
3596

    
3597
    aSign = extractFloatx80Sign( a );
3598
    bSign = extractFloatx80Sign( b );
3599
    if ( aSign == bSign ) {
3600
        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3601
    }
3602
    else {
3603
        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3604
    }
3605

    
3606
}
3607

    
3608
/*----------------------------------------------------------------------------
3609
| Returns the result of multiplying the extended double-precision floating-
3610
| point values `a' and `b'.  The operation is performed according to the
3611
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3612
*----------------------------------------------------------------------------*/
3613

    
3614
floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
3615
{
3616
    flag aSign, bSign, zSign;
3617
    int32 aExp, bExp, zExp;
3618
    bits64 aSig, bSig, zSig0, zSig1;
3619
    floatx80 z;
3620

    
3621
    aSig = extractFloatx80Frac( a );
3622
    aExp = extractFloatx80Exp( a );
3623
    aSign = extractFloatx80Sign( a );
3624
    bSig = extractFloatx80Frac( b );
3625
    bExp = extractFloatx80Exp( b );
3626
    bSign = extractFloatx80Sign( b );
3627
    zSign = aSign ^ bSign;
3628
    if ( aExp == 0x7FFF ) {
3629
        if (    (bits64) ( aSig<<1 )
3630
             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3631
            return propagateFloatx80NaN( a, b STATUS_VAR );
3632
        }
3633
        if ( ( bExp | bSig ) == 0 ) goto invalid;
3634
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3635
    }
3636
    if ( bExp == 0x7FFF ) {
3637
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3638
        if ( ( aExp | aSig ) == 0 ) {
3639
 invalid:
3640
            float_raise( float_flag_invalid STATUS_VAR);
3641
            z.low = floatx80_default_nan_low;
3642
            z.high = floatx80_default_nan_high;
3643
            return z;
3644
        }
3645
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3646
    }
3647
    if ( aExp == 0 ) {
3648
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3649
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3650
    }
3651
    if ( bExp == 0 ) {
3652
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3653
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3654
    }
3655
    zExp = aExp + bExp - 0x3FFE;
3656
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
3657
    if ( 0 < (sbits64) zSig0 ) {
3658
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3659
        --zExp;
3660
    }
3661
    return
3662
        roundAndPackFloatx80(
3663
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3664

    
3665
}
3666

    
3667
/*----------------------------------------------------------------------------
3668
| Returns the result of dividing the extended double-precision floating-point
3669
| value `a' by the corresponding value `b'.  The operation is performed
3670
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3671
*----------------------------------------------------------------------------*/
3672

    
3673
floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
3674
{
3675
    flag aSign, bSign, zSign;
3676
    int32 aExp, bExp, zExp;
3677
    bits64 aSig, bSig, zSig0, zSig1;
3678
    bits64 rem0, rem1, rem2, term0, term1, term2;
3679
    floatx80 z;
3680

    
3681
    aSig = extractFloatx80Frac( a );
3682
    aExp = extractFloatx80Exp( a );
3683
    aSign = extractFloatx80Sign( a );
3684
    bSig = extractFloatx80Frac( b );
3685
    bExp = extractFloatx80Exp( b );
3686
    bSign = extractFloatx80Sign( b );
3687
    zSign = aSign ^ bSign;
3688
    if ( aExp == 0x7FFF ) {
3689
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3690
        if ( bExp == 0x7FFF ) {
3691
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3692
            goto invalid;
3693
        }
3694
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3695
    }
3696
    if ( bExp == 0x7FFF ) {
3697
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3698
        return packFloatx80( zSign, 0, 0 );
3699
    }
3700
    if ( bExp == 0 ) {
3701
        if ( bSig == 0 ) {
3702
            if ( ( aExp | aSig ) == 0 ) {
3703
 invalid:
3704
                float_raise( float_flag_invalid STATUS_VAR);
3705
                z.low = floatx80_default_nan_low;
3706
                z.high = floatx80_default_nan_high;
3707
                return z;
3708
            }
3709
            float_raise( float_flag_divbyzero STATUS_VAR);
3710
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3711
        }
3712
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3713
    }
3714
    if ( aExp == 0 ) {
3715
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3716
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3717
    }
3718
    zExp = aExp - bExp + 0x3FFE;
3719
    rem1 = 0;
3720
    if ( bSig <= aSig ) {
3721
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
3722
        ++zExp;
3723
    }
3724
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3725
    mul64To128( bSig, zSig0, &term0, &term1 );
3726
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3727
    while ( (sbits64) rem0 < 0 ) {
3728
        --zSig0;
3729
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3730
    }
3731
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
3732
    if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3733
        mul64To128( bSig, zSig1, &term1, &term2 );
3734
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3735
        while ( (sbits64) rem1 < 0 ) {
3736
            --zSig1;
3737
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3738
        }
3739
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
3740
    }
3741
    return
3742
        roundAndPackFloatx80(
3743
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3744

    
3745
}
3746

    
3747
/*----------------------------------------------------------------------------
3748
| Returns the remainder of the extended double-precision floating-point value
3749
| `a' with respect to the corresponding value `b'.  The operation is performed
3750
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3751
*----------------------------------------------------------------------------*/
3752

    
3753
floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
3754
{
3755
    flag aSign, bSign, zSign;
3756
    int32 aExp, bExp, expDiff;
3757
    bits64 aSig0, aSig1, bSig;
3758
    bits64 q, term0, term1, alternateASig0, alternateASig1;
3759
    floatx80 z;
3760

    
3761
    aSig0 = extractFloatx80Frac( a );
3762
    aExp = extractFloatx80Exp( a );
3763
    aSign = extractFloatx80Sign( a );
3764
    bSig = extractFloatx80Frac( b );
3765
    bExp = extractFloatx80Exp( b );
3766
    bSign = extractFloatx80Sign( b );
3767
    if ( aExp == 0x7FFF ) {
3768
        if (    (bits64) ( aSig0<<1 )
3769
             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3770
            return propagateFloatx80NaN( a, b STATUS_VAR );
3771
        }
3772
        goto invalid;
3773
    }
3774
    if ( bExp == 0x7FFF ) {
3775
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3776
        return a;
3777
    }
3778
    if ( bExp == 0 ) {
3779
        if ( bSig == 0 ) {
3780
 invalid:
3781
            float_raise( float_flag_invalid STATUS_VAR);
3782
            z.low = floatx80_default_nan_low;
3783
            z.high = floatx80_default_nan_high;
3784
            return z;
3785
        }
3786
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3787
    }
3788
    if ( aExp == 0 ) {
3789
        if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3790
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3791
    }
3792
    bSig |= LIT64( 0x8000000000000000 );
3793
    zSign = aSign;
3794
    expDiff = aExp - bExp;
3795
    aSig1 = 0;
3796
    if ( expDiff < 0 ) {
3797
        if ( expDiff < -1 ) return a;
3798
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3799
        expDiff = 0;
3800
    }
3801
    q = ( bSig <= aSig0 );
3802
    if ( q ) aSig0 -= bSig;
3803
    expDiff -= 64;
3804
    while ( 0 < expDiff ) {
3805
        q = estimateDiv128To64( aSig0, aSig1, bSig );
3806
        q = ( 2 < q ) ? q - 2 : 0;
3807
        mul64To128( bSig, q, &term0, &term1 );
3808
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3809
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3810
        expDiff -= 62;
3811
    }
3812
    expDiff += 64;
3813
    if ( 0 < expDiff ) {
3814
        q = estimateDiv128To64( aSig0, aSig1, bSig );
3815
        q = ( 2 < q ) ? q - 2 : 0;
3816
        q >>= 64 - expDiff;
3817
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3818
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3819
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3820
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
3821
            ++q;
3822
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3823
        }
3824
    }
3825
    else {
3826
        term1 = 0;
3827
        term0 = bSig;
3828
    }
3829
    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3830
    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3831
         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3832
              && ( q & 1 ) )
3833
       ) {
3834
        aSig0 = alternateASig0;
3835
        aSig1 = alternateASig1;
3836
        zSign = ! zSign;
3837
    }
3838
    return
3839
        normalizeRoundAndPackFloatx80(
3840
            80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
3841

    
3842
}
3843

    
3844
/*----------------------------------------------------------------------------
3845
| Returns the square root of the extended double-precision floating-point
3846
| value `a'.  The operation is performed according to the IEC/IEEE Standard
3847
| for Binary Floating-Point Arithmetic.
3848
*----------------------------------------------------------------------------*/
3849

    
3850
floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
3851
{
3852
    flag aSign;
3853
    int32 aExp, zExp;
3854
    bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3855
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3856
    floatx80 z;
3857

    
3858
    aSig0 = extractFloatx80Frac( a );
3859
    aExp = extractFloatx80Exp( a );
3860
    aSign = extractFloatx80Sign( a );
3861
    if ( aExp == 0x7FFF ) {
3862
        if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
3863
        if ( ! aSign ) return a;
3864
        goto invalid;
3865
    }
3866
    if ( aSign ) {
3867
        if ( ( aExp | aSig0 ) == 0 ) return a;
3868
 invalid:
3869
        float_raise( float_flag_invalid STATUS_VAR);
3870
        z.low = floatx80_default_nan_low;
3871
        z.high = floatx80_default_nan_high;
3872
        return z;
3873
    }
3874
    if ( aExp == 0 ) {
3875
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3876
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3877
    }
3878
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3879
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3880
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
3881
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
3882
    doubleZSig0 = zSig0<<1;
3883
    mul64To128( zSig0, zSig0, &term0, &term1 );
3884
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3885
    while ( (sbits64) rem0 < 0 ) {
3886
        --zSig0;
3887
        doubleZSig0 -= 2;
3888
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
3889
    }
3890
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
3891
    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3892
        if ( zSig1 == 0 ) zSig1 = 1;
3893
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
3894
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3895
        mul64To128( zSig1, zSig1, &term2, &term3 );
3896
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3897
        while ( (sbits64) rem1 < 0 ) {
3898
            --zSig1;
3899
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
3900
            term3 |= 1;
3901
            term2 |= doubleZSig0;
3902
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
3903
        }
3904
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3905
    }
3906
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
3907
    zSig0 |= doubleZSig0;
3908
    return
3909
        roundAndPackFloatx80(
3910
            STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
3911

    
3912
}
3913

    
3914
/*----------------------------------------------------------------------------
3915
| Returns 1 if the extended double-precision floating-point value `a' is
3916
| equal to the corresponding value `b', and 0 otherwise.  The comparison is
3917
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3918
| Arithmetic.
3919
*----------------------------------------------------------------------------*/
3920

    
3921
int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
3922
{
3923

    
3924
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3925
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3926
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3927
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3928
       ) {
3929
        if (    floatx80_is_signaling_nan( a )
3930
             || floatx80_is_signaling_nan( b ) ) {
3931
            float_raise( float_flag_invalid STATUS_VAR);
3932
        }
3933
        return 0;
3934
    }
3935
    return
3936
           ( a.low == b.low )
3937
        && (    ( a.high == b.high )
3938
             || (    ( a.low == 0 )
3939
                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3940
           );
3941

    
3942
}
3943

    
3944
/*----------------------------------------------------------------------------
3945
| Returns 1 if the extended double-precision floating-point value `a' is
3946
| less than or equal to the corresponding value `b', and 0 otherwise.  The
3947
| comparison is performed according to the IEC/IEEE Standard for Binary
3948
| Floating-Point Arithmetic.
3949
*----------------------------------------------------------------------------*/
3950

    
3951
int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
3952
{
3953
    flag aSign, bSign;
3954

    
3955
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3956
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3957
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3958
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3959
       ) {
3960
        float_raise( float_flag_invalid STATUS_VAR);
3961
        return 0;
3962
    }
3963
    aSign = extractFloatx80Sign( a );
3964
    bSign = extractFloatx80Sign( b );
3965
    if ( aSign != bSign ) {
3966
        return
3967
               aSign
3968
            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3969
                 == 0 );
3970
    }
3971
    return
3972
          aSign ? le128( b.high, b.low, a.high, a.low )
3973
        : le128( a.high, a.low, b.high, b.low );
3974

    
3975
}
3976

    
3977
/*----------------------------------------------------------------------------
3978
| Returns 1 if the extended double-precision floating-point value `a' is
3979
| less than the corresponding value `b', and 0 otherwise.  The comparison
3980
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3981
| Arithmetic.
3982
*----------------------------------------------------------------------------*/
3983

    
3984
int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
3985
{
3986
    flag aSign, bSign;
3987

    
3988
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3989
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3990
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3991
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3992
       ) {
3993
        float_raise( float_flag_invalid STATUS_VAR);
3994
        return 0;
3995
    }
3996
    aSign = extractFloatx80Sign( a );
3997
    bSign = extractFloatx80Sign( b );
3998
    if ( aSign != bSign ) {
3999
        return
4000
               aSign
4001
            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4002
                 != 0 );
4003
    }
4004
    return
4005
          aSign ? lt128( b.high, b.low, a.high, a.low )
4006
        : lt128( a.high, a.low, b.high, b.low );
4007

    
4008
}
4009

    
4010
/*----------------------------------------------------------------------------
4011
| Returns 1 if the extended double-precision floating-point value `a' is equal
4012
| to the corresponding value `b', and 0 otherwise.  The invalid exception is
4013
| raised if either operand is a NaN.  Otherwise, the comparison is performed
4014
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4015
*----------------------------------------------------------------------------*/
4016

    
4017
int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4018
{
4019

    
4020
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4021
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4022
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4023
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4024
       ) {
4025
        float_raise( float_flag_invalid STATUS_VAR);
4026
        return 0;
4027
    }
4028
    return
4029
           ( a.low == b.low )
4030
        && (    ( a.high == b.high )
4031
             || (    ( a.low == 0 )
4032
                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4033
           );
4034

    
4035
}
4036

    
4037
/*----------------------------------------------------------------------------
4038
| Returns 1 if the extended double-precision floating-point value `a' is less
4039
| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4040
| do not cause an exception.  Otherwise, the comparison is performed according
4041
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4042
*----------------------------------------------------------------------------*/
4043

    
4044
int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4045
{
4046
    flag aSign, bSign;
4047

    
4048
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4049
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4050
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4051
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4052
       ) {
4053
        if (    floatx80_is_signaling_nan( a )
4054
             || floatx80_is_signaling_nan( b ) ) {
4055
            float_raise( float_flag_invalid STATUS_VAR);
4056
        }
4057
        return 0;
4058
    }
4059
    aSign = extractFloatx80Sign( a );
4060
    bSign = extractFloatx80Sign( b );
4061
    if ( aSign != bSign ) {
4062
        return
4063
               aSign
4064
            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4065
                 == 0 );
4066
    }
4067
    return
4068
          aSign ? le128( b.high, b.low, a.high, a.low )
4069
        : le128( a.high, a.low, b.high, b.low );
4070

    
4071
}
4072

    
4073
/*----------------------------------------------------------------------------
4074
| Returns 1 if the extended double-precision floating-point value `a' is less
4075
| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4076
| an exception.  Otherwise, the comparison is performed according to the
4077
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4078
*----------------------------------------------------------------------------*/
4079

    
4080
int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4081
{
4082
    flag aSign, bSign;
4083

    
4084
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4085
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4086
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4087
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4088
       ) {
4089
        if (    floatx80_is_signaling_nan( a )
4090
             || floatx80_is_signaling_nan( b ) ) {
4091
            float_raise( float_flag_invalid STATUS_VAR);
4092
        }
4093
        return 0;
4094
    }
4095
    aSign = extractFloatx80Sign( a );
4096
    bSign = extractFloatx80Sign( b );
4097
    if ( aSign != bSign ) {
4098
        return
4099
               aSign
4100
            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4101
                 != 0 );
4102
    }
4103
    return
4104
          aSign ? lt128( b.high, b.low, a.high, a.low )
4105
        : lt128( a.high, a.low, b.high, b.low );
4106

    
4107
}
4108

    
4109
#endif
4110

    
4111
#ifdef FLOAT128
4112

    
4113
/*----------------------------------------------------------------------------
4114
| Returns the result of converting the quadruple-precision floating-point
4115
| value `a' to the 32-bit two's complement integer format.  The conversion
4116
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4117
| Arithmetic---which means in particular that the conversion is rounded
4118
| according to the current rounding mode.  If `a' is a NaN, the largest
4119
| positive integer is returned.  Otherwise, if the conversion overflows, the
4120
| largest integer with the same sign as `a' is returned.
4121
*----------------------------------------------------------------------------*/
4122

    
4123
int32 float128_to_int32( float128 a STATUS_PARAM )
4124
{
4125
    flag aSign;
4126
    int32 aExp, shiftCount;
4127
    bits64 aSig0, aSig1;
4128

    
4129
    aSig1 = extractFloat128Frac1( a );
4130
    aSig0 = extractFloat128Frac0( a );
4131
    aExp = extractFloat128Exp( a );
4132
    aSign = extractFloat128Sign( a );
4133
    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4134
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4135
    aSig0 |= ( aSig1 != 0 );
4136
    shiftCount = 0x4028 - aExp;
4137
    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4138
    return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4139

    
4140
}
4141

    
4142
/*----------------------------------------------------------------------------
4143
| Returns the result of converting the quadruple-precision floating-point
4144
| value `a' to the 32-bit two's complement integer format.  The conversion
4145
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4146
| Arithmetic, except that the conversion is always rounded toward zero.  If
4147
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4148
| conversion overflows, the largest integer with the same sign as `a' is
4149
| returned.
4150
*----------------------------------------------------------------------------*/
4151

    
4152
int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4153
{
4154
    flag aSign;
4155
    int32 aExp, shiftCount;
4156
    bits64 aSig0, aSig1, savedASig;
4157
    int32 z;
4158

    
4159
    aSig1 = extractFloat128Frac1( a );
4160
    aSig0 = extractFloat128Frac0( a );
4161
    aExp = extractFloat128Exp( a );
4162
    aSign = extractFloat128Sign( a );
4163
    aSig0 |= ( aSig1 != 0 );
4164
    if ( 0x401E < aExp ) {
4165
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4166
        goto invalid;
4167
    }
4168
    else if ( aExp < 0x3FFF ) {
4169
        if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4170
        return 0;
4171
    }
4172
    aSig0 |= LIT64( 0x0001000000000000 );
4173
    shiftCount = 0x402F - aExp;
4174
    savedASig = aSig0;
4175
    aSig0 >>= shiftCount;
4176
    z = aSig0;
4177
    if ( aSign ) z = - z;
4178
    if ( ( z < 0 ) ^ aSign ) {
4179
 invalid:
4180
        float_raise( float_flag_invalid STATUS_VAR);
4181
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4182
    }
4183
    if ( ( aSig0<<shiftCount ) != savedASig ) {
4184
        STATUS(float_exception_flags) |= float_flag_inexact;
4185
    }
4186
    return z;
4187

    
4188
}
4189

    
4190
/*----------------------------------------------------------------------------
4191
| Returns the result of converting the quadruple-precision floating-point
4192
| value `a' to the 64-bit two's complement integer format.  The conversion
4193
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4194
| Arithmetic---which means in particular that the conversion is rounded
4195
| according to the current rounding mode.  If `a' is a NaN, the largest
4196
| positive integer is returned.  Otherwise, if the conversion overflows, the
4197
| largest integer with the same sign as `a' is returned.
4198
*----------------------------------------------------------------------------*/
4199

    
4200
int64 float128_to_int64( float128 a STATUS_PARAM )
4201
{
4202
    flag aSign;
4203
    int32 aExp, shiftCount;
4204
    bits64 aSig0, aSig1;
4205

    
4206
    aSig1 = extractFloat128Frac1( a );
4207
    aSig0 = extractFloat128Frac0( a );
4208
    aExp = extractFloat128Exp( a );
4209
    aSign = extractFloat128Sign( a );
4210
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4211
    shiftCount = 0x402F - aExp;
4212
    if ( shiftCount <= 0 ) {
4213
        if ( 0x403E < aExp ) {
4214
            float_raise( float_flag_invalid STATUS_VAR);
4215
            if (    ! aSign
4216
                 || (    ( aExp == 0x7FFF )
4217
                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4218
                    )
4219
               ) {
4220
                return LIT64( 0x7FFFFFFFFFFFFFFF );
4221
            }
4222
            return (sbits64) LIT64( 0x8000000000000000 );
4223
        }
4224
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4225
    }
4226
    else {
4227
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4228
    }
4229
    return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4230

    
4231
}
4232

    
4233
/*----------------------------------------------------------------------------
4234
| Returns the result of converting the quadruple-precision floating-point
4235
| value `a' to the 64-bit two's complement integer format.  The conversion
4236
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4237
| Arithmetic, except that the conversion is always rounded toward zero.
4238
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4239
| the conversion overflows, the largest integer with the same sign as `a' is
4240
| returned.
4241
*----------------------------------------------------------------------------*/
4242

    
4243
int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4244
{
4245
    flag aSign;
4246
    int32 aExp, shiftCount;
4247
    bits64 aSig0, aSig1;
4248
    int64 z;
4249

    
4250
    aSig1 = extractFloat128Frac1( a );
4251
    aSig0 = extractFloat128Frac0( a );
4252
    aExp = extractFloat128Exp( a );
4253
    aSign = extractFloat128Sign( a );
4254
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4255
    shiftCount = aExp - 0x402F;
4256
    if ( 0 < shiftCount ) {
4257
        if ( 0x403E <= aExp ) {
4258
            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4259
            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4260
                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4261
                if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4262
            }
4263
            else {
4264
                float_raise( float_flag_invalid STATUS_VAR);
4265
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4266
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
4267
                }
4268
            }
4269
            return (sbits64) LIT64( 0x8000000000000000 );
4270
        }
4271
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4272
        if ( (bits64) ( aSig1<<shiftCount ) ) {
4273
            STATUS(float_exception_flags) |= float_flag_inexact;
4274
        }
4275
    }
4276
    else {
4277
        if ( aExp < 0x3FFF ) {
4278
            if ( aExp | aSig0 | aSig1 ) {
4279
                STATUS(float_exception_flags) |= float_flag_inexact;
4280
            }
4281
            return 0;
4282
        }
4283
        z = aSig0>>( - shiftCount );
4284
        if (    aSig1
4285
             || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4286
            STATUS(float_exception_flags) |= float_flag_inexact;
4287
        }
4288
    }
4289
    if ( aSign ) z = - z;
4290
    return z;
4291

    
4292
}
4293

    
4294
/*----------------------------------------------------------------------------
4295
| Returns the result of converting the quadruple-precision floating-point
4296
| value `a' to the single-precision floating-point format.  The conversion
4297
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4298
| Arithmetic.
4299
*----------------------------------------------------------------------------*/
4300

    
4301
float32 float128_to_float32( float128 a STATUS_PARAM )
4302
{
4303
    flag aSign;
4304
    int32 aExp;
4305
    bits64 aSig0, aSig1;
4306
    bits32 zSig;
4307

    
4308
    aSig1 = extractFloat128Frac1( a );
4309
    aSig0 = extractFloat128Frac0( a );
4310
    aExp = extractFloat128Exp( a );
4311
    aSign = extractFloat128Sign( a );
4312
    if ( aExp == 0x7FFF ) {
4313
        if ( aSig0 | aSig1 ) {
4314
            return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4315
        }
4316
        return packFloat32( aSign, 0xFF, 0 );
4317
    }
4318
    aSig0 |= ( aSig1 != 0 );
4319
    shift64RightJamming( aSig0, 18, &aSig0 );
4320
    zSig = aSig0;
4321
    if ( aExp || zSig ) {
4322
        zSig |= 0x40000000;
4323
        aExp -= 0x3F81;
4324
    }
4325
    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4326

    
4327
}
4328

    
4329
/*----------------------------------------------------------------------------
4330
| Returns the result of converting the quadruple-precision floating-point
4331
| value `a' to the double-precision floating-point format.  The conversion
4332
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4333
| Arithmetic.
4334
*----------------------------------------------------------------------------*/
4335

    
4336
float64 float128_to_float64( float128 a STATUS_PARAM )
4337
{
4338
    flag aSign;
4339
    int32 aExp;
4340
    bits64 aSig0, aSig1;
4341

    
4342
    aSig1 = extractFloat128Frac1( a );
4343
    aSig0 = extractFloat128Frac0( a );
4344
    aExp = extractFloat128Exp( a );
4345
    aSign = extractFloat128Sign( a );
4346
    if ( aExp == 0x7FFF ) {
4347
        if ( aSig0 | aSig1 ) {
4348
            return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4349
        }
4350
        return packFloat64( aSign, 0x7FF, 0 );
4351
    }
4352
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4353
    aSig0 |= ( aSig1 != 0 );
4354
    if ( aExp || aSig0 ) {
4355
        aSig0 |= LIT64( 0x4000000000000000 );
4356
        aExp -= 0x3C01;
4357
    }
4358
    return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4359

    
4360
}
4361

    
4362
#ifdef FLOATX80
4363

    
4364
/*----------------------------------------------------------------------------
4365
| Returns the result of converting the quadruple-precision floating-point
4366
| value `a' to the extended double-precision floating-point format.  The
4367
| conversion is performed according to the IEC/IEEE Standard for Binary
4368
| Floating-Point Arithmetic.
4369
*----------------------------------------------------------------------------*/
4370

    
4371
floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4372
{
4373
    flag aSign;
4374
    int32 aExp;
4375
    bits64 aSig0, aSig1;
4376

    
4377
    aSig1 = extractFloat128Frac1( a );
4378
    aSig0 = extractFloat128Frac0( a );
4379
    aExp = extractFloat128Exp( a );
4380
    aSign = extractFloat128Sign( a );
4381
    if ( aExp == 0x7FFF ) {
4382
        if ( aSig0 | aSig1 ) {
4383
            return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4384
        }
4385
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4386
    }
4387
    if ( aExp == 0 ) {
4388
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4389
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4390
    }
4391
    else {
4392
        aSig0 |= LIT64( 0x0001000000000000 );
4393
    }
4394
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4395
    return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4396

    
4397
}
4398

    
4399
#endif
4400

    
4401
/*----------------------------------------------------------------------------
4402
| Rounds the quadruple-precision floating-point value `a' to an integer, and
4403
| returns the result as a quadruple-precision floating-point value.  The
4404
| operation is performed according to the IEC/IEEE Standard for Binary
4405
| Floating-Point Arithmetic.
4406
*----------------------------------------------------------------------------*/
4407

    
4408
float128 float128_round_to_int( float128 a STATUS_PARAM )
4409
{
4410
    flag aSign;
4411
    int32 aExp;
4412
    bits64 lastBitMask, roundBitsMask;
4413
    int8 roundingMode;
4414
    float128 z;
4415

    
4416
    aExp = extractFloat128Exp( a );
4417
    if ( 0x402F <= aExp ) {
4418
        if ( 0x406F <= aExp ) {
4419
            if (    ( aExp == 0x7FFF )
4420
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4421
               ) {
4422
                return propagateFloat128NaN( a, a STATUS_VAR );
4423
            }
4424
            return a;
4425
        }
4426
        lastBitMask = 1;
4427
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4428
        roundBitsMask = lastBitMask - 1;
4429
        z = a;
4430
        roundingMode = STATUS(float_rounding_mode);
4431
        if ( roundingMode == float_round_nearest_even ) {
4432
            if ( lastBitMask ) {
4433
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4434
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4435
            }
4436
            else {
4437
                if ( (sbits64) z.low < 0 ) {
4438
                    ++z.high;
4439
                    if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4440
                }
4441
            }
4442
        }
4443
        else if ( roundingMode != float_round_to_zero ) {
4444
            if (   extractFloat128Sign( z )
4445
                 ^ ( roundingMode == float_round_up ) ) {
4446
                add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4447
            }
4448
        }
4449
        z.low &= ~ roundBitsMask;
4450
    }
4451
    else {
4452
        if ( aExp < 0x3FFF ) {
4453
            if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4454
            STATUS(float_exception_flags) |= float_flag_inexact;
4455
            aSign = extractFloat128Sign( a );
4456
            switch ( STATUS(float_rounding_mode) ) {
4457
             case float_round_nearest_even:
4458
                if (    ( aExp == 0x3FFE )
4459
                     && (   extractFloat128Frac0( a )
4460
                          | extractFloat128Frac1( a ) )
4461
                   ) {
4462
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
4463
                }
4464
                break;
4465
             case float_round_down:
4466
                return
4467
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4468
                    : packFloat128( 0, 0, 0, 0 );
4469
             case float_round_up:
4470
                return
4471
                      aSign ? packFloat128( 1, 0, 0, 0 )
4472
                    : packFloat128( 0, 0x3FFF, 0, 0 );
4473
            }
4474
            return packFloat128( aSign, 0, 0, 0 );
4475
        }
4476
        lastBitMask = 1;
4477
        lastBitMask <<= 0x402F - aExp;
4478
        roundBitsMask = lastBitMask - 1;
4479
        z.low = 0;
4480
        z.high = a.high;
4481
        roundingMode = STATUS(float_rounding_mode);
4482
        if ( roundingMode == float_round_nearest_even ) {
4483
            z.high += lastBitMask>>1;
4484
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4485
                z.high &= ~ lastBitMask;
4486
            }
4487
        }
4488
        else if ( roundingMode != float_round_to_zero ) {
4489
            if (   extractFloat128Sign( z )
4490
                 ^ ( roundingMode == float_round_up ) ) {
4491
                z.high |= ( a.low != 0 );
4492
                z.high += roundBitsMask;
4493
            }
4494
        }
4495
        z.high &= ~ roundBitsMask;
4496
    }
4497
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4498
        STATUS(float_exception_flags) |= float_flag_inexact;
4499
    }
4500
    return z;
4501

    
4502
}
4503

    
4504
/*----------------------------------------------------------------------------
4505
| Returns the result of adding the absolute values of the quadruple-precision
4506
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4507
| before being returned.  `zSign' is ignored if the result is a NaN.
4508
| The addition is performed according to the IEC/IEEE Standard for Binary
4509
| Floating-Point Arithmetic.
4510
*----------------------------------------------------------------------------*/
4511

    
4512
static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4513
{
4514
    int32 aExp, bExp, zExp;
4515
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4516
    int32 expDiff;
4517

    
4518
    aSig1 = extractFloat128Frac1( a );
4519
    aSig0 = extractFloat128Frac0( a );
4520
    aExp = extractFloat128Exp( a );
4521
    bSig1 = extractFloat128Frac1( b );
4522
    bSig0 = extractFloat128Frac0( b );
4523
    bExp = extractFloat128Exp( b );
4524
    expDiff = aExp - bExp;
4525
    if ( 0 < expDiff ) {
4526
        if ( aExp == 0x7FFF ) {
4527
            if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4528
            return a;
4529
        }
4530
        if ( bExp == 0 ) {
4531
            --expDiff;
4532
        }
4533
        else {
4534
            bSig0 |= LIT64( 0x0001000000000000 );
4535
        }
4536
        shift128ExtraRightJamming(
4537
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4538
        zExp = aExp;
4539
    }
4540
    else if ( expDiff < 0 ) {
4541
        if ( bExp == 0x7FFF ) {
4542
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4543
            return packFloat128( zSign, 0x7FFF, 0, 0 );
4544
        }
4545
        if ( aExp == 0 ) {
4546
            ++expDiff;
4547
        }
4548
        else {
4549
            aSig0 |= LIT64( 0x0001000000000000 );
4550
        }
4551
        shift128ExtraRightJamming(
4552
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4553
        zExp = bExp;
4554
    }
4555
    else {
4556
        if ( aExp == 0x7FFF ) {
4557
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4558
                return propagateFloat128NaN( a, b STATUS_VAR );
4559
            }
4560
            return a;
4561
        }
4562
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4563
        if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4564
        zSig2 = 0;
4565
        zSig0 |= LIT64( 0x0002000000000000 );
4566
        zExp = aExp;
4567
        goto shiftRight1;
4568
    }
4569
    aSig0 |= LIT64( 0x0001000000000000 );
4570
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4571
    --zExp;
4572
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4573
    ++zExp;
4574
 shiftRight1:
4575
    shift128ExtraRightJamming(
4576
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4577
 roundAndPack:
4578
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4579

    
4580
}
4581

    
4582
/*----------------------------------------------------------------------------
4583
| Returns the result of subtracting the absolute values of the quadruple-
4584
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
4585
| difference is negated before being returned.  `zSign' is ignored if the
4586
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
4587
| Standard for Binary Floating-Point Arithmetic.
4588
*----------------------------------------------------------------------------*/
4589

    
4590
static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4591
{
4592
    int32 aExp, bExp, zExp;
4593
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4594
    int32 expDiff;
4595
    float128 z;
4596

    
4597
    aSig1 = extractFloat128Frac1( a );
4598
    aSig0 = extractFloat128Frac0( a );
4599
    aExp = extractFloat128Exp( a );
4600
    bSig1 = extractFloat128Frac1( b );
4601
    bSig0 = extractFloat128Frac0( b );
4602
    bExp = extractFloat128Exp( b );
4603
    expDiff = aExp - bExp;
4604
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4605
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4606
    if ( 0 < expDiff ) goto aExpBigger;
4607
    if ( expDiff < 0 ) goto bExpBigger;
4608
    if ( aExp == 0x7FFF ) {
4609
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4610
            return propagateFloat128NaN( a, b STATUS_VAR );
4611
        }
4612
        float_raise( float_flag_invalid STATUS_VAR);
4613
        z.low = float128_default_nan_low;
4614
        z.high = float128_default_nan_high;
4615
        return z;
4616
    }
4617
    if ( aExp == 0 ) {
4618
        aExp = 1;
4619
        bExp = 1;
4620
    }
4621
    if ( bSig0 < aSig0 ) goto aBigger;
4622
    if ( aSig0 < bSig0 ) goto bBigger;
4623
    if ( bSig1 < aSig1 ) goto aBigger;
4624
    if ( aSig1 < bSig1 ) goto bBigger;
4625
    return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
4626
 bExpBigger:
4627
    if ( bExp == 0x7FFF ) {
4628
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4629
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4630
    }
4631
    if ( aExp == 0 ) {
4632
        ++expDiff;
4633
    }
4634
    else {
4635
        aSig0 |= LIT64( 0x4000000000000000 );
4636
    }
4637
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4638
    bSig0 |= LIT64( 0x4000000000000000 );
4639
 bBigger:
4640
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4641
    zExp = bExp;
4642
    zSign ^= 1;
4643
    goto normalizeRoundAndPack;
4644
 aExpBigger:
4645
    if ( aExp == 0x7FFF ) {
4646
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4647
        return a;
4648
    }
4649
    if ( bExp == 0 ) {
4650
        --expDiff;
4651
    }
4652
    else {
4653
        bSig0 |= LIT64( 0x4000000000000000 );
4654
    }
4655
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4656
    aSig0 |= LIT64( 0x4000000000000000 );
4657
 aBigger:
4658
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4659
    zExp = aExp;
4660
 normalizeRoundAndPack:
4661
    --zExp;
4662
    return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
4663

    
4664
}
4665

    
4666
/*----------------------------------------------------------------------------
4667
| Returns the result of adding the quadruple-precision floating-point values
4668
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4669
| for Binary Floating-Point Arithmetic.
4670
*----------------------------------------------------------------------------*/
4671

    
4672
float128 float128_add( float128 a, float128 b STATUS_PARAM )
4673
{
4674
    flag aSign, bSign;
4675

    
4676
    aSign = extractFloat128Sign( a );
4677
    bSign = extractFloat128Sign( b );
4678
    if ( aSign == bSign ) {
4679
        return addFloat128Sigs( a, b, aSign STATUS_VAR );
4680
    }
4681
    else {
4682
        return subFloat128Sigs( a, b, aSign STATUS_VAR );
4683
    }
4684

    
4685
}
4686

    
4687
/*----------------------------------------------------------------------------
4688
| Returns the result of subtracting the quadruple-precision floating-point
4689
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4690
| Standard for Binary Floating-Point Arithmetic.
4691
*----------------------------------------------------------------------------*/
4692

    
4693
float128 float128_sub( float128 a, float128 b STATUS_PARAM )
4694
{
4695
    flag aSign, bSign;
4696

    
4697
    aSign = extractFloat128Sign( a );
4698
    bSign = extractFloat128Sign( b );
4699
    if ( aSign == bSign ) {
4700
        return subFloat128Sigs( a, b, aSign STATUS_VAR );
4701
    }
4702
    else {
4703
        return addFloat128Sigs( a, b, aSign STATUS_VAR );
4704
    }
4705

    
4706
}
4707

    
4708
/*----------------------------------------------------------------------------
4709
| Returns the result of multiplying the quadruple-precision floating-point
4710
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4711
| Standard for Binary Floating-Point Arithmetic.
4712
*----------------------------------------------------------------------------*/
4713

    
4714
float128 float128_mul( float128 a, float128 b STATUS_PARAM )
4715
{
4716
    flag aSign, bSign, zSign;
4717
    int32 aExp, bExp, zExp;
4718
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4719
    float128 z;
4720

    
4721
    aSig1 = extractFloat128Frac1( a );
4722
    aSig0 = extractFloat128Frac0( a );
4723
    aExp = extractFloat128Exp( a );
4724
    aSign = extractFloat128Sign( a );
4725
    bSig1 = extractFloat128Frac1( b );
4726
    bSig0 = extractFloat128Frac0( b );
4727
    bExp = extractFloat128Exp( b );
4728
    bSign = extractFloat128Sign( b );
4729
    zSign = aSign ^ bSign;
4730
    if ( aExp == 0x7FFF ) {
4731
        if (    ( aSig0 | aSig1 )
4732
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4733
            return propagateFloat128NaN( a, b STATUS_VAR );
4734
        }
4735
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4736
        return packFloat128( zSign, 0x7FFF, 0, 0 );
4737
    }
4738
    if ( bExp == 0x7FFF ) {
4739
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4740
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4741
 invalid:
4742
            float_raise( float_flag_invalid STATUS_VAR);
4743
            z.low = float128_default_nan_low;
4744
            z.high = float128_default_nan_high;
4745
            return z;
4746
        }
4747
        return packFloat128( zSign, 0x7FFF, 0, 0 );
4748
    }
4749
    if ( aExp == 0 ) {
4750
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4751
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4752
    }
4753
    if ( bExp == 0 ) {
4754
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4755
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4756
    }
4757
    zExp = aExp + bExp - 0x4000;
4758
    aSig0 |= LIT64( 0x0001000000000000 );
4759
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4760
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4761
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4762
    zSig2 |= ( zSig3 != 0 );
4763
    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4764
        shift128ExtraRightJamming(
4765
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4766
        ++zExp;
4767
    }
4768
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4769

    
4770
}
4771

    
4772
/*----------------------------------------------------------------------------
4773
| Returns the result of dividing the quadruple-precision floating-point value
4774
| `a' by the corresponding value `b'.  The operation is performed according to
4775
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4776
*----------------------------------------------------------------------------*/
4777

    
4778
float128 float128_div( float128 a, float128 b STATUS_PARAM )
4779
{
4780
    flag aSign, bSign, zSign;
4781
    int32 aExp, bExp, zExp;
4782
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4783
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4784
    float128 z;
4785

    
4786
    aSig1 = extractFloat128Frac1( a );
4787
    aSig0 = extractFloat128Frac0( a );
4788
    aExp = extractFloat128Exp( a );
4789
    aSign = extractFloat128Sign( a );
4790
    bSig1 = extractFloat128Frac1( b );
4791
    bSig0 = extractFloat128Frac0( b );
4792
    bExp = extractFloat128Exp( b );
4793
    bSign = extractFloat128Sign( b );
4794
    zSign = aSign ^ bSign;
4795
    if ( aExp == 0x7FFF ) {
4796
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4797
        if ( bExp == 0x7FFF ) {
4798
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4799
            goto invalid;
4800
        }
4801
        return packFloat128( zSign, 0x7FFF, 0, 0 );
4802
    }
4803
    if ( bExp == 0x7FFF ) {
4804
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4805
        return packFloat128( zSign, 0, 0, 0 );
4806
    }
4807
    if ( bExp == 0 ) {
4808
        if ( ( bSig0 | bSig1 ) == 0 ) {
4809
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4810
 invalid:
4811
                float_raise( float_flag_invalid STATUS_VAR);
4812
                z.low = float128_default_nan_low;
4813
                z.high = float128_default_nan_high;
4814
                return z;
4815
            }
4816
            float_raise( float_flag_divbyzero STATUS_VAR);
4817
            return packFloat128( zSign, 0x7FFF, 0, 0 );
4818
        }
4819
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4820
    }
4821
    if ( aExp == 0 ) {
4822
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4823
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4824
    }
4825
    zExp = aExp - bExp + 0x3FFD;
4826
    shortShift128Left(
4827
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4828
    shortShift128Left(
4829
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4830
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4831
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4832
        ++zExp;
4833
    }
4834
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4835
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4836
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4837
    while ( (sbits64) rem0 < 0 ) {
4838
        --zSig0;
4839
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4840
    }
4841
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4842
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4843
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4844
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4845
        while ( (sbits64) rem1 < 0 ) {
4846
            --zSig1;
4847
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4848
        }
4849
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4850
    }
4851
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4852
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4853

    
4854
}
4855

    
4856
/*----------------------------------------------------------------------------
4857
| Returns the remainder of the quadruple-precision floating-point value `a'
4858
| with respect to the corresponding value `b'.  The operation is performed
4859
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4860
*----------------------------------------------------------------------------*/
4861

    
4862
float128 float128_rem( float128 a, float128 b STATUS_PARAM )
4863
{
4864
    flag aSign, bSign, zSign;
4865
    int32 aExp, bExp, expDiff;
4866
    bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
4867
    bits64 allZero, alternateASig0, alternateASig1, sigMean1;
4868
    sbits64 sigMean0;
4869
    float128 z;
4870

    
4871
    aSig1 = extractFloat128Frac1( a );
4872
    aSig0 = extractFloat128Frac0( a );
4873
    aExp = extractFloat128Exp( a );
4874
    aSign = extractFloat128Sign( a );
4875
    bSig1 = extractFloat128Frac1( b );
4876
    bSig0 = extractFloat128Frac0( b );
4877
    bExp = extractFloat128Exp( b );
4878
    bSign = extractFloat128Sign( b );
4879
    if ( aExp == 0x7FFF ) {
4880
        if (    ( aSig0 | aSig1 )
4881
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4882
            return propagateFloat128NaN( a, b STATUS_VAR );
4883
        }
4884
        goto invalid;
4885
    }
4886
    if ( bExp == 0x7FFF ) {
4887
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4888
        return a;
4889
    }
4890
    if ( bExp == 0 ) {
4891
        if ( ( bSig0 | bSig1 ) == 0 ) {
4892
 invalid:
4893
            float_raise( float_flag_invalid STATUS_VAR);
4894
            z.low = float128_default_nan_low;
4895
            z.high = float128_default_nan_high;
4896
            return z;
4897
        }
4898
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4899
    }
4900
    if ( aExp == 0 ) {
4901
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
4902
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4903
    }
4904
    expDiff = aExp - bExp;
4905
    if ( expDiff < -1 ) return a;
4906
    shortShift128Left(
4907
        aSig0 | LIT64( 0x0001000000000000 ),
4908
        aSig1,
4909
        15 - ( expDiff < 0 ),
4910
        &aSig0,
4911
        &aSig1
4912
    );
4913
    shortShift128Left(
4914
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4915
    q = le128( bSig0, bSig1, aSig0, aSig1 );
4916
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4917
    expDiff -= 64;
4918
    while ( 0 < expDiff ) {
4919
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4920
        q = ( 4 < q ) ? q - 4 : 0;
4921
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4922
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
4923
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
4924
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
4925
        expDiff -= 61;
4926
    }
4927
    if ( -64 < expDiff ) {
4928
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4929
        q = ( 4 < q ) ? q - 4 : 0;
4930
        q >>= - expDiff;
4931
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4932
        expDiff += 52;
4933
        if ( expDiff < 0 ) {
4934
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4935
        }
4936
        else {
4937
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
4938
        }
4939
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4940
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
4941
    }
4942
    else {
4943
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
4944
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4945
    }
4946
    do {
4947
        alternateASig0 = aSig0;
4948
        alternateASig1 = aSig1;
4949
        ++q;
4950
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4951
    } while ( 0 <= (sbits64) aSig0 );
4952
    add128(
4953
        aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
4954
    if (    ( sigMean0 < 0 )
4955
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
4956
        aSig0 = alternateASig0;
4957
        aSig1 = alternateASig1;
4958
    }
4959
    zSign = ( (sbits64) aSig0 < 0 );
4960
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
4961
    return
4962
        normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
4963

    
4964
}
4965

    
4966
/*----------------------------------------------------------------------------
4967
| Returns the square root of the quadruple-precision floating-point value `a'.
4968
| The operation is performed according to the IEC/IEEE Standard for Binary
4969
| Floating-Point Arithmetic.
4970
*----------------------------------------------------------------------------*/
4971

    
4972
float128 float128_sqrt( float128 a STATUS_PARAM )
4973
{
4974
    flag aSign;
4975
    int32 aExp, zExp;
4976
    bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
4977
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4978
    float128 z;
4979

    
4980
    aSig1 = extractFloat128Frac1( a );
4981
    aSig0 = extractFloat128Frac0( a );
4982
    aExp = extractFloat128Exp( a );
4983
    aSign = extractFloat128Sign( a );
4984
    if ( aExp == 0x7FFF ) {
4985
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
4986
        if ( ! aSign ) return a;
4987
        goto invalid;
4988
    }
4989
    if ( aSign ) {
4990
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
4991
 invalid:
4992
        float_raise( float_flag_invalid STATUS_VAR);
4993
        z.low = float128_default_nan_low;
4994
        z.high = float128_default_nan_high;
4995
        return z;
4996
    }
4997
    if ( aExp == 0 ) {
4998
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
4999
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5000
    }
5001
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5002
    aSig0 |= LIT64( 0x0001000000000000 );
5003
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5004
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5005
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5006
    doubleZSig0 = zSig0<<1;
5007
    mul64To128( zSig0, zSig0, &term0, &term1 );
5008
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5009
    while ( (sbits64) rem0 < 0 ) {
5010
        --zSig0;
5011
        doubleZSig0 -= 2;
5012
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5013
    }
5014
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5015
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5016
        if ( zSig1 == 0 ) zSig1 = 1;
5017
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5018
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5019
        mul64To128( zSig1, zSig1, &term2, &term3 );
5020
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5021
        while ( (sbits64) rem1 < 0 ) {
5022
            --zSig1;
5023
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5024
            term3 |= 1;
5025
            term2 |= doubleZSig0;
5026
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5027
        }
5028
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5029
    }
5030
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5031
    return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5032

    
5033
}
5034

    
5035
/*----------------------------------------------------------------------------
5036
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5037
| the corresponding value `b', and 0 otherwise.  The comparison is performed
5038
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5039
*----------------------------------------------------------------------------*/
5040

    
5041
int float128_eq( float128 a, float128 b STATUS_PARAM )
5042
{
5043

    
5044
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5045
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5046
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5047
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5048
       ) {
5049
        if (    float128_is_signaling_nan( a )
5050
             || float128_is_signaling_nan( b ) ) {
5051
            float_raise( float_flag_invalid STATUS_VAR);
5052
        }
5053
        return 0;
5054
    }
5055
    return
5056
           ( a.low == b.low )
5057
        && (    ( a.high == b.high )
5058
             || (    ( a.low == 0 )
5059
                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5060
           );
5061

    
5062
}
5063

    
5064
/*----------------------------------------------------------------------------
5065
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5066
| or equal to the corresponding value `b', and 0 otherwise.  The comparison
5067
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5068
| Arithmetic.
5069
*----------------------------------------------------------------------------*/
5070

    
5071
int float128_le( float128 a, float128 b STATUS_PARAM )
5072
{
5073
    flag aSign, bSign;
5074

    
5075
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5076
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5077
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5078
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5079
       ) {
5080
        float_raise( float_flag_invalid STATUS_VAR);
5081
        return 0;
5082
    }
5083
    aSign = extractFloat128Sign( a );
5084
    bSign = extractFloat128Sign( b );
5085
    if ( aSign != bSign ) {
5086
        return
5087
               aSign
5088
            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5089
                 == 0 );
5090
    }
5091
    return
5092
          aSign ? le128( b.high, b.low, a.high, a.low )
5093
        : le128( a.high, a.low, b.high, b.low );
5094

    
5095
}
5096

    
5097
/*----------------------------------------------------------------------------
5098
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5099
| the corresponding value `b', and 0 otherwise.  The comparison is performed
5100
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5101
*----------------------------------------------------------------------------*/
5102

    
5103
int float128_lt( float128 a, float128 b STATUS_PARAM )
5104
{
5105
    flag aSign, bSign;
5106

    
5107
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5108
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5109
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5110
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5111
       ) {
5112
        float_raise( float_flag_invalid STATUS_VAR);
5113
        return 0;
5114
    }
5115
    aSign = extractFloat128Sign( a );
5116
    bSign = extractFloat128Sign( b );
5117
    if ( aSign != bSign ) {
5118
        return
5119
               aSign
5120
            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5121
                 != 0 );
5122
    }
5123
    return
5124
          aSign ? lt128( b.high, b.low, a.high, a.low )
5125
        : lt128( a.high, a.low, b.high, b.low );
5126

    
5127
}
5128

    
5129
/*----------------------------------------------------------------------------
5130
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5131
| the corresponding value `b', and 0 otherwise.  The invalid exception is
5132
| raised if either operand is a NaN.  Otherwise, the comparison is performed
5133
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5134
*----------------------------------------------------------------------------*/
5135

    
5136
int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5137
{
5138

    
5139
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5140
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5141
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5142
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5143
       ) {
5144
        float_raise( float_flag_invalid STATUS_VAR);
5145
        return 0;
5146
    }
5147
    return
5148
           ( a.low == b.low )
5149
        && (    ( a.high == b.high )
5150
             || (    ( a.low == 0 )
5151
                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5152
           );
5153

    
5154
}
5155

    
5156
/*----------------------------------------------------------------------------
5157
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5158
| or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5159
| cause an exception.  Otherwise, the comparison is performed according to the
5160
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5161
*----------------------------------------------------------------------------*/
5162

    
5163
int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5164
{
5165
    flag aSign, bSign;
5166

    
5167
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5168
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5169
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5170
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5171
       ) {
5172
        if (    float128_is_signaling_nan( a )
5173
             || float128_is_signaling_nan( b ) ) {
5174
            float_raise( float_flag_invalid STATUS_VAR);
5175
        }
5176
        return 0;
5177
    }
5178
    aSign = extractFloat128Sign( a );
5179
    bSign = extractFloat128Sign( b );
5180
    if ( aSign != bSign ) {
5181
        return
5182
               aSign
5183
            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5184
                 == 0 );
5185
    }
5186
    return
5187
          aSign ? le128( b.high, b.low, a.high, a.low )
5188
        : le128( a.high, a.low, b.high, b.low );
5189

    
5190
}
5191

    
5192
/*----------------------------------------------------------------------------
5193
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5194
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5195
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5196
| Standard for Binary Floating-Point Arithmetic.
5197
*----------------------------------------------------------------------------*/
5198

    
5199
int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5200
{
5201
    flag aSign, bSign;
5202

    
5203
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5204
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5205
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5206
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5207
       ) {
5208
        if (    float128_is_signaling_nan( a )
5209
             || float128_is_signaling_nan( b ) ) {
5210
            float_raise( float_flag_invalid STATUS_VAR);
5211
        }
5212
        return 0;
5213
    }
5214
    aSign = extractFloat128Sign( a );
5215
    bSign = extractFloat128Sign( b );
5216
    if ( aSign != bSign ) {
5217
        return
5218
               aSign
5219
            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5220
                 != 0 );
5221
    }
5222
    return
5223
          aSign ? lt128( b.high, b.low, a.high, a.low )
5224
        : lt128( a.high, a.low, b.high, b.low );
5225

    
5226
}
5227

    
5228
#endif
5229

    
5230
/* misc functions */
5231
float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5232
{
5233
    return int64_to_float32(a STATUS_VAR);
5234
}
5235

    
5236
float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5237
{
5238
    return int64_to_float64(a STATUS_VAR);
5239
}
5240

    
5241
unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5242
{
5243
    int64_t v;
5244
    unsigned int res;
5245

    
5246
    v = float32_to_int64(a STATUS_VAR);
5247
    if (v < 0) {
5248
        res = 0;
5249
        float_raise( float_flag_invalid STATUS_VAR);
5250
    } else if (v > 0xffffffff) {
5251
        res = 0xffffffff;
5252
        float_raise( float_flag_invalid STATUS_VAR);
5253
    } else {
5254
        res = v;
5255
    }
5256
    return res;
5257
}
5258

    
5259
unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5260
{
5261
    int64_t v;
5262
    unsigned int res;
5263

    
5264
    v = float32_to_int64_round_to_zero(a STATUS_VAR);
5265
    if (v < 0) {
5266
        res = 0;
5267
        float_raise( float_flag_invalid STATUS_VAR);
5268
    } else if (v > 0xffffffff) {
5269
        res = 0xffffffff;
5270
        float_raise( float_flag_invalid STATUS_VAR);
5271
    } else {
5272
        res = v;
5273
    }
5274
    return res;
5275
}
5276

    
5277
unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5278
{
5279
    int64_t v;
5280
    unsigned int res;
5281

    
5282
    v = float64_to_int64(a STATUS_VAR);
5283
    if (v < 0) {
5284
        res = 0;
5285
        float_raise( float_flag_invalid STATUS_VAR);
5286
    } else if (v > 0xffffffff) {
5287
        res = 0xffffffff;
5288
        float_raise( float_flag_invalid STATUS_VAR);
5289
    } else {
5290
        res = v;
5291
    }
5292
    return res;
5293
}
5294

    
5295
unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5296
{
5297
    int64_t v;
5298
    unsigned int res;
5299

    
5300
    v = float64_to_int64_round_to_zero(a STATUS_VAR);
5301
    if (v < 0) {
5302
        res = 0;
5303
        float_raise( float_flag_invalid STATUS_VAR);
5304
    } else if (v > 0xffffffff) {
5305
        res = 0xffffffff;
5306
        float_raise( float_flag_invalid STATUS_VAR);
5307
    } else {
5308
        res = v;
5309
    }
5310
    return res;
5311
}
5312

    
5313
uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5314
{
5315
    int64_t v;
5316

    
5317
    v = int64_to_float64(INT64_MIN STATUS_VAR);
5318
    v = float64_to_int64((a + v) STATUS_VAR);
5319

    
5320
    return v - INT64_MIN;
5321
}
5322

    
5323
uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5324
{
5325
    int64_t v;
5326

    
5327
    v = int64_to_float64(INT64_MIN STATUS_VAR);
5328
    v = float64_to_int64_round_to_zero((a + v) STATUS_VAR);
5329

    
5330
    return v - INT64_MIN;
5331
}
5332

    
5333
#define COMPARE(s, nan_exp)                                                  \
5334
INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
5335
                                      int is_quiet STATUS_PARAM )            \
5336
{                                                                            \
5337
    flag aSign, bSign;                                                       \
5338
                                                                             \
5339
    if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
5340
         extractFloat ## s ## Frac( a ) ) ||                                 \
5341
        ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
5342
          extractFloat ## s ## Frac( b ) )) {                                \
5343
        if (!is_quiet ||                                                     \
5344
            float ## s ## _is_signaling_nan( a ) ||                          \
5345
            float ## s ## _is_signaling_nan( b ) ) {                         \
5346
            float_raise( float_flag_invalid STATUS_VAR);                     \
5347
        }                                                                    \
5348
        return float_relation_unordered;                                     \
5349
    }                                                                        \
5350
    aSign = extractFloat ## s ## Sign( a );                                  \
5351
    bSign = extractFloat ## s ## Sign( b );                                  \
5352
    if ( aSign != bSign ) {                                                  \
5353
        if ( (bits ## s) ( ( a | b )<<1 ) == 0 ) {                           \
5354
            /* zero case */                                                  \
5355
            return float_relation_equal;                                     \
5356
        } else {                                                             \
5357
            return 1 - (2 * aSign);                                          \
5358
        }                                                                    \
5359
    } else {                                                                 \
5360
        if (a == b) {                                                        \
5361
            return float_relation_equal;                                     \
5362
        } else {                                                             \
5363
            return 1 - 2 * (aSign ^ ( a < b ));                              \
5364
        }                                                                    \
5365
    }                                                                        \
5366
}                                                                            \
5367
                                                                             \
5368
int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
5369
{                                                                            \
5370
    return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
5371
}                                                                            \
5372
                                                                             \
5373
int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
5374
{                                                                            \
5375
    return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
5376
}
5377

    
5378
COMPARE(32, 0xff)
5379
COMPARE(64, 0x7ff)
5380

    
5381
/* Multiply A by 2 raised to the power N.  */
5382
float32 float32_scalbn( float32 a, int n STATUS_PARAM )
5383
{
5384
    flag aSign;
5385
    int16 aExp;
5386
    bits32 aSig;
5387

    
5388
    aSig = extractFloat32Frac( a );
5389
    aExp = extractFloat32Exp( a );
5390
    aSign = extractFloat32Sign( a );
5391

    
5392
    if ( aExp == 0xFF ) {
5393
        return a;
5394
    }
5395
    aExp += n;
5396
    return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
5397
}
5398

    
5399
float64 float64_scalbn( float64 a, int n STATUS_PARAM )
5400
{
5401
    flag aSign;
5402
    int16 aExp;
5403
    bits64 aSig;
5404

    
5405
    aSig = extractFloat64Frac( a );
5406
    aExp = extractFloat64Exp( a );
5407
    aSign = extractFloat64Sign( a );
5408

    
5409
    if ( aExp == 0x7FF ) {
5410
        return a;
5411
    }
5412
    aExp += n;
5413
    return roundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
5414
}
5415

    
5416
#ifdef FLOATX80
5417
floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
5418
{
5419
    flag aSign;
5420
    int16 aExp;
5421
    bits64 aSig;
5422

    
5423
    aSig = extractFloatx80Frac( a );
5424
    aExp = extractFloatx80Exp( a );
5425
    aSign = extractFloatx80Sign( a );
5426

    
5427
    if ( aExp == 0x7FF ) {
5428
        return a;
5429
    }
5430
    aExp += n;
5431
    return roundAndPackFloatx80( STATUS(floatx80_rounding_precision),
5432
                                 aSign, aExp, aSig, 0 STATUS_VAR );
5433
}
5434
#endif
5435

    
5436
#ifdef FLOAT128
5437
float128 float128_scalbn( float128 a, int n STATUS_PARAM )
5438
{
5439
    flag aSign;
5440
    int32 aExp;
5441
    bits64 aSig0, aSig1;
5442

    
5443
    aSig1 = extractFloat128Frac1( a );
5444
    aSig0 = extractFloat128Frac0( a );
5445
    aExp = extractFloat128Exp( a );
5446
    aSign = extractFloat128Sign( a );
5447
    if ( aExp == 0x7FFF ) {
5448
        return a;
5449
    }
5450
    aExp += n;
5451
    return roundAndPackFloat128( aSign, aExp, aSig0, aSig1, 0 STATUS_VAR );
5452

    
5453
}
5454
#endif