Statistics
| Branch: | Revision:

root / fpu / softfloat.c @ e6e5906b

History | View | Annotate | Download (188.5 kB)

1

    
2
/*============================================================================
3

4
This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5
Package, Release 2b.
6

7
Written by John R. Hauser.  This work was made possible in part by the
8
International Computer Science Institute, located at Suite 600, 1947 Center
9
Street, Berkeley, California 94704.  Funding was partially provided by the
10
National Science Foundation under grant MIP-9311980.  The original version
11
of this code was written as part of a project to build a fixed-point vector
12
processor in collaboration with the University of California at Berkeley,
13
overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
14
is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15
arithmetic/SoftFloat.html'.
16

17
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
18
been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19
RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20
AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21
COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22
EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23
INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24
OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
25

26
Derivative works are acceptable, even for commercial purposes, so long as
27
(1) the source code for the derivative work includes prominent notice that
28
the work is derivative, and (2) the source code includes prominent notice with
29
these four paragraphs for those parts of this code that are retained.
30

31
=============================================================================*/
32

    
33
#include "softfloat.h"
34

    
35
/*----------------------------------------------------------------------------
36
| Primitive arithmetic functions, including multi-word arithmetic, and
37
| division and square root approximations.  (Can be specialized to target if
38
| desired.)
39
*----------------------------------------------------------------------------*/
40
#include "softfloat-macros.h"
41

    
42
/*----------------------------------------------------------------------------
43
| Functions and definitions to determine:  (1) whether tininess for underflow
44
| is detected before or after rounding by default, (2) what (if anything)
45
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
46
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
47
| are propagated from function inputs to output.  These details are target-
48
| specific.
49
*----------------------------------------------------------------------------*/
50
#include "softfloat-specialize.h"
51

    
52
void set_float_rounding_mode(int val STATUS_PARAM)
53
{
54
    STATUS(float_rounding_mode) = val;
55
}
56

    
57
void set_float_exception_flags(int val STATUS_PARAM)
58
{
59
    STATUS(float_exception_flags) = val;
60
}
61

    
62
#ifdef FLOATX80
63
void set_floatx80_rounding_precision(int val STATUS_PARAM)
64
{
65
    STATUS(floatx80_rounding_precision) = val;
66
}
67
#endif
68

    
69
/*----------------------------------------------------------------------------
70
| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
71
| and 7, and returns the properly rounded 32-bit integer corresponding to the
72
| input.  If `zSign' is 1, the input is negated before being converted to an
73
| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
74
| is simply rounded to an integer, with the inexact exception raised if the
75
| input cannot be represented exactly as an integer.  However, if the fixed-
76
| point input is too large, the invalid exception is raised and the largest
77
| positive or negative integer is returned.
78
*----------------------------------------------------------------------------*/
79

    
80
static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
81
{
82
    int8 roundingMode;
83
    flag roundNearestEven;
84
    int8 roundIncrement, roundBits;
85
    int32 z;
86

    
87
    roundingMode = STATUS(float_rounding_mode);
88
    roundNearestEven = ( roundingMode == float_round_nearest_even );
89
    roundIncrement = 0x40;
90
    if ( ! roundNearestEven ) {
91
        if ( roundingMode == float_round_to_zero ) {
92
            roundIncrement = 0;
93
        }
94
        else {
95
            roundIncrement = 0x7F;
96
            if ( zSign ) {
97
                if ( roundingMode == float_round_up ) roundIncrement = 0;
98
            }
99
            else {
100
                if ( roundingMode == float_round_down ) roundIncrement = 0;
101
            }
102
        }
103
    }
104
    roundBits = absZ & 0x7F;
105
    absZ = ( absZ + roundIncrement )>>7;
106
    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
107
    z = absZ;
108
    if ( zSign ) z = - z;
109
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
110
        float_raise( float_flag_invalid STATUS_VAR);
111
        return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
112
    }
113
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
114
    return z;
115

    
116
}
117

    
118
/*----------------------------------------------------------------------------
119
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
120
| `absZ1', with binary point between bits 63 and 64 (between the input words),
121
| and returns the properly rounded 64-bit integer corresponding to the input.
122
| If `zSign' is 1, the input is negated before being converted to an integer.
123
| Ordinarily, the fixed-point input is simply rounded to an integer, with
124
| the inexact exception raised if the input cannot be represented exactly as
125
| an integer.  However, if the fixed-point input is too large, the invalid
126
| exception is raised and the largest positive or negative integer is
127
| returned.
128
*----------------------------------------------------------------------------*/
129

    
130
static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
131
{
132
    int8 roundingMode;
133
    flag roundNearestEven, increment;
134
    int64 z;
135

    
136
    roundingMode = STATUS(float_rounding_mode);
137
    roundNearestEven = ( roundingMode == float_round_nearest_even );
138
    increment = ( (sbits64) absZ1 < 0 );
139
    if ( ! roundNearestEven ) {
140
        if ( roundingMode == float_round_to_zero ) {
141
            increment = 0;
142
        }
143
        else {
144
            if ( zSign ) {
145
                increment = ( roundingMode == float_round_down ) && absZ1;
146
            }
147
            else {
148
                increment = ( roundingMode == float_round_up ) && absZ1;
149
            }
150
        }
151
    }
152
    if ( increment ) {
153
        ++absZ0;
154
        if ( absZ0 == 0 ) goto overflow;
155
        absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
156
    }
157
    z = absZ0;
158
    if ( zSign ) z = - z;
159
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
160
 overflow:
161
        float_raise( float_flag_invalid STATUS_VAR);
162
        return
163
              zSign ? (sbits64) LIT64( 0x8000000000000000 )
164
            : LIT64( 0x7FFFFFFFFFFFFFFF );
165
    }
166
    if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
167
    return z;
168

    
169
}
170

    
171
/*----------------------------------------------------------------------------
172
| Returns the fraction bits of the single-precision floating-point value `a'.
173
*----------------------------------------------------------------------------*/
174

    
175
INLINE bits32 extractFloat32Frac( float32 a )
176
{
177

    
178
    return a & 0x007FFFFF;
179

    
180
}
181

    
182
/*----------------------------------------------------------------------------
183
| Returns the exponent bits of the single-precision floating-point value `a'.
184
*----------------------------------------------------------------------------*/
185

    
186
INLINE int16 extractFloat32Exp( float32 a )
187
{
188

    
189
    return ( a>>23 ) & 0xFF;
190

    
191
}
192

    
193
/*----------------------------------------------------------------------------
194
| Returns the sign bit of the single-precision floating-point value `a'.
195
*----------------------------------------------------------------------------*/
196

    
197
INLINE flag extractFloat32Sign( float32 a )
198
{
199

    
200
    return a>>31;
201

    
202
}
203

    
204
/*----------------------------------------------------------------------------
205
| Normalizes the subnormal single-precision floating-point value represented
206
| by the denormalized significand `aSig'.  The normalized exponent and
207
| significand are stored at the locations pointed to by `zExpPtr' and
208
| `zSigPtr', respectively.
209
*----------------------------------------------------------------------------*/
210

    
211
static void
212
 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
213
{
214
    int8 shiftCount;
215

    
216
    shiftCount = countLeadingZeros32( aSig ) - 8;
217
    *zSigPtr = aSig<<shiftCount;
218
    *zExpPtr = 1 - shiftCount;
219

    
220
}
221

    
222
/*----------------------------------------------------------------------------
223
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
224
| single-precision floating-point value, returning the result.  After being
225
| shifted into the proper positions, the three fields are simply added
226
| together to form the result.  This means that any integer portion of `zSig'
227
| will be added into the exponent.  Since a properly normalized significand
228
| will have an integer portion equal to 1, the `zExp' input should be 1 less
229
| than the desired result exponent whenever `zSig' is a complete, normalized
230
| significand.
231
*----------------------------------------------------------------------------*/
232

    
233
INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
234
{
235

    
236
    return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
237

    
238
}
239

    
240
/*----------------------------------------------------------------------------
241
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
242
| and significand `zSig', and returns the proper single-precision floating-
243
| point value corresponding to the abstract input.  Ordinarily, the abstract
244
| value is simply rounded and packed into the single-precision format, with
245
| the inexact exception raised if the abstract input cannot be represented
246
| exactly.  However, if the abstract value is too large, the overflow and
247
| inexact exceptions are raised and an infinity or maximal finite value is
248
| returned.  If the abstract value is too small, the input value is rounded to
249
| a subnormal number, and the underflow and inexact exceptions are raised if
250
| the abstract input cannot be represented exactly as a subnormal single-
251
| precision floating-point number.
252
|     The input significand `zSig' has its binary point between bits 30
253
| and 29, which is 7 bits to the left of the usual location.  This shifted
254
| significand must be normalized or smaller.  If `zSig' is not normalized,
255
| `zExp' must be 0; in that case, the result returned is a subnormal number,
256
| and it must not require rounding.  In the usual case that `zSig' is
257
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
258
| The handling of underflow and overflow follows the IEC/IEEE Standard for
259
| Binary Floating-Point Arithmetic.
260
*----------------------------------------------------------------------------*/
261

    
262
static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
263
{
264
    int8 roundingMode;
265
    flag roundNearestEven;
266
    int8 roundIncrement, roundBits;
267
    flag isTiny;
268

    
269
    roundingMode = STATUS(float_rounding_mode);
270
    roundNearestEven = ( roundingMode == float_round_nearest_even );
271
    roundIncrement = 0x40;
272
    if ( ! roundNearestEven ) {
273
        if ( roundingMode == float_round_to_zero ) {
274
            roundIncrement = 0;
275
        }
276
        else {
277
            roundIncrement = 0x7F;
278
            if ( zSign ) {
279
                if ( roundingMode == float_round_up ) roundIncrement = 0;
280
            }
281
            else {
282
                if ( roundingMode == float_round_down ) roundIncrement = 0;
283
            }
284
        }
285
    }
286
    roundBits = zSig & 0x7F;
287
    if ( 0xFD <= (bits16) zExp ) {
288
        if (    ( 0xFD < zExp )
289
             || (    ( zExp == 0xFD )
290
                  && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
291
           ) {
292
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
293
            return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
294
        }
295
        if ( zExp < 0 ) {
296
            isTiny =
297
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
298
                || ( zExp < -1 )
299
                || ( zSig + roundIncrement < 0x80000000 );
300
            shift32RightJamming( zSig, - zExp, &zSig );
301
            zExp = 0;
302
            roundBits = zSig & 0x7F;
303
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
304
        }
305
    }
306
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
307
    zSig = ( zSig + roundIncrement )>>7;
308
    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
309
    if ( zSig == 0 ) zExp = 0;
310
    return packFloat32( zSign, zExp, zSig );
311

    
312
}
313

    
314
/*----------------------------------------------------------------------------
315
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
316
| and significand `zSig', and returns the proper single-precision floating-
317
| point value corresponding to the abstract input.  This routine is just like
318
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
319
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
320
| floating-point exponent.
321
*----------------------------------------------------------------------------*/
322

    
323
static float32
324
 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
325
{
326
    int8 shiftCount;
327

    
328
    shiftCount = countLeadingZeros32( zSig ) - 1;
329
    return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
330

    
331
}
332

    
333
/*----------------------------------------------------------------------------
334
| Returns the fraction bits of the double-precision floating-point value `a'.
335
*----------------------------------------------------------------------------*/
336

    
337
INLINE bits64 extractFloat64Frac( float64 a )
338
{
339

    
340
    return a & LIT64( 0x000FFFFFFFFFFFFF );
341

    
342
}
343

    
344
/*----------------------------------------------------------------------------
345
| Returns the exponent bits of the double-precision floating-point value `a'.
346
*----------------------------------------------------------------------------*/
347

    
348
INLINE int16 extractFloat64Exp( float64 a )
349
{
350

    
351
    return ( a>>52 ) & 0x7FF;
352

    
353
}
354

    
355
/*----------------------------------------------------------------------------
356
| Returns the sign bit of the double-precision floating-point value `a'.
357
*----------------------------------------------------------------------------*/
358

    
359
INLINE flag extractFloat64Sign( float64 a )
360
{
361

    
362
    return a>>63;
363

    
364
}
365

    
366
/*----------------------------------------------------------------------------
367
| Normalizes the subnormal double-precision floating-point value represented
368
| by the denormalized significand `aSig'.  The normalized exponent and
369
| significand are stored at the locations pointed to by `zExpPtr' and
370
| `zSigPtr', respectively.
371
*----------------------------------------------------------------------------*/
372

    
373
static void
374
 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
375
{
376
    int8 shiftCount;
377

    
378
    shiftCount = countLeadingZeros64( aSig ) - 11;
379
    *zSigPtr = aSig<<shiftCount;
380
    *zExpPtr = 1 - shiftCount;
381

    
382
}
383

    
384
/*----------------------------------------------------------------------------
385
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
386
| double-precision floating-point value, returning the result.  After being
387
| shifted into the proper positions, the three fields are simply added
388
| together to form the result.  This means that any integer portion of `zSig'
389
| will be added into the exponent.  Since a properly normalized significand
390
| will have an integer portion equal to 1, the `zExp' input should be 1 less
391
| than the desired result exponent whenever `zSig' is a complete, normalized
392
| significand.
393
*----------------------------------------------------------------------------*/
394

    
395
INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
396
{
397

    
398
    return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
399

    
400
}
401

    
402
/*----------------------------------------------------------------------------
403
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
404
| and significand `zSig', and returns the proper double-precision floating-
405
| point value corresponding to the abstract input.  Ordinarily, the abstract
406
| value is simply rounded and packed into the double-precision format, with
407
| the inexact exception raised if the abstract input cannot be represented
408
| exactly.  However, if the abstract value is too large, the overflow and
409
| inexact exceptions are raised and an infinity or maximal finite value is
410
| returned.  If the abstract value is too small, the input value is rounded
411
| to a subnormal number, and the underflow and inexact exceptions are raised
412
| if the abstract input cannot be represented exactly as a subnormal double-
413
| precision floating-point number.
414
|     The input significand `zSig' has its binary point between bits 62
415
| and 61, which is 10 bits to the left of the usual location.  This shifted
416
| significand must be normalized or smaller.  If `zSig' is not normalized,
417
| `zExp' must be 0; in that case, the result returned is a subnormal number,
418
| and it must not require rounding.  In the usual case that `zSig' is
419
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
420
| The handling of underflow and overflow follows the IEC/IEEE Standard for
421
| Binary Floating-Point Arithmetic.
422
*----------------------------------------------------------------------------*/
423

    
424
static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
425
{
426
    int8 roundingMode;
427
    flag roundNearestEven;
428
    int16 roundIncrement, roundBits;
429
    flag isTiny;
430

    
431
    roundingMode = STATUS(float_rounding_mode);
432
    roundNearestEven = ( roundingMode == float_round_nearest_even );
433
    roundIncrement = 0x200;
434
    if ( ! roundNearestEven ) {
435
        if ( roundingMode == float_round_to_zero ) {
436
            roundIncrement = 0;
437
        }
438
        else {
439
            roundIncrement = 0x3FF;
440
            if ( zSign ) {
441
                if ( roundingMode == float_round_up ) roundIncrement = 0;
442
            }
443
            else {
444
                if ( roundingMode == float_round_down ) roundIncrement = 0;
445
            }
446
        }
447
    }
448
    roundBits = zSig & 0x3FF;
449
    if ( 0x7FD <= (bits16) zExp ) {
450
        if (    ( 0x7FD < zExp )
451
             || (    ( zExp == 0x7FD )
452
                  && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
453
           ) {
454
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
455
            return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
456
        }
457
        if ( zExp < 0 ) {
458
            isTiny =
459
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
460
                || ( zExp < -1 )
461
                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
462
            shift64RightJamming( zSig, - zExp, &zSig );
463
            zExp = 0;
464
            roundBits = zSig & 0x3FF;
465
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
466
        }
467
    }
468
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
469
    zSig = ( zSig + roundIncrement )>>10;
470
    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
471
    if ( zSig == 0 ) zExp = 0;
472
    return packFloat64( zSign, zExp, zSig );
473

    
474
}
475

    
476
/*----------------------------------------------------------------------------
477
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
478
| and significand `zSig', and returns the proper double-precision floating-
479
| point value corresponding to the abstract input.  This routine is just like
480
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
481
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
482
| floating-point exponent.
483
*----------------------------------------------------------------------------*/
484

    
485
static float64
486
 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
487
{
488
    int8 shiftCount;
489

    
490
    shiftCount = countLeadingZeros64( zSig ) - 1;
491
    return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
492

    
493
}
494

    
495
#ifdef FLOATX80
496

    
497
/*----------------------------------------------------------------------------
498
| Returns the fraction bits of the extended double-precision floating-point
499
| value `a'.
500
*----------------------------------------------------------------------------*/
501

    
502
INLINE bits64 extractFloatx80Frac( floatx80 a )
503
{
504

    
505
    return a.low;
506

    
507
}
508

    
509
/*----------------------------------------------------------------------------
510
| Returns the exponent bits of the extended double-precision floating-point
511
| value `a'.
512
*----------------------------------------------------------------------------*/
513

    
514
INLINE int32 extractFloatx80Exp( floatx80 a )
515
{
516

    
517
    return a.high & 0x7FFF;
518

    
519
}
520

    
521
/*----------------------------------------------------------------------------
522
| Returns the sign bit of the extended double-precision floating-point value
523
| `a'.
524
*----------------------------------------------------------------------------*/
525

    
526
INLINE flag extractFloatx80Sign( floatx80 a )
527
{
528

    
529
    return a.high>>15;
530

    
531
}
532

    
533
/*----------------------------------------------------------------------------
534
| Normalizes the subnormal extended double-precision floating-point value
535
| represented by the denormalized significand `aSig'.  The normalized exponent
536
| and significand are stored at the locations pointed to by `zExpPtr' and
537
| `zSigPtr', respectively.
538
*----------------------------------------------------------------------------*/
539

    
540
static void
541
 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
542
{
543
    int8 shiftCount;
544

    
545
    shiftCount = countLeadingZeros64( aSig );
546
    *zSigPtr = aSig<<shiftCount;
547
    *zExpPtr = 1 - shiftCount;
548

    
549
}
550

    
551
/*----------------------------------------------------------------------------
552
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
553
| extended double-precision floating-point value, returning the result.
554
*----------------------------------------------------------------------------*/
555

    
556
INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
557
{
558
    floatx80 z;
559

    
560
    z.low = zSig;
561
    z.high = ( ( (bits16) zSign )<<15 ) + zExp;
562
    return z;
563

    
564
}
565

    
566
/*----------------------------------------------------------------------------
567
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
568
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
569
| and returns the proper extended double-precision floating-point value
570
| corresponding to the abstract input.  Ordinarily, the abstract value is
571
| rounded and packed into the extended double-precision format, with the
572
| inexact exception raised if the abstract input cannot be represented
573
| exactly.  However, if the abstract value is too large, the overflow and
574
| inexact exceptions are raised and an infinity or maximal finite value is
575
| returned.  If the abstract value is too small, the input value is rounded to
576
| a subnormal number, and the underflow and inexact exceptions are raised if
577
| the abstract input cannot be represented exactly as a subnormal extended
578
| double-precision floating-point number.
579
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
580
| number of bits as single or double precision, respectively.  Otherwise, the
581
| result is rounded to the full precision of the extended double-precision
582
| format.
583
|     The input significand must be normalized or smaller.  If the input
584
| significand is not normalized, `zExp' must be 0; in that case, the result
585
| returned is a subnormal number, and it must not require rounding.  The
586
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
587
| Floating-Point Arithmetic.
588
*----------------------------------------------------------------------------*/
589

    
590
static floatx80
591
 roundAndPackFloatx80(
592
     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
593
 STATUS_PARAM)
594
{
595
    int8 roundingMode;
596
    flag roundNearestEven, increment, isTiny;
597
    int64 roundIncrement, roundMask, roundBits;
598

    
599
    roundingMode = STATUS(float_rounding_mode);
600
    roundNearestEven = ( roundingMode == float_round_nearest_even );
601
    if ( roundingPrecision == 80 ) goto precision80;
602
    if ( roundingPrecision == 64 ) {
603
        roundIncrement = LIT64( 0x0000000000000400 );
604
        roundMask = LIT64( 0x00000000000007FF );
605
    }
606
    else if ( roundingPrecision == 32 ) {
607
        roundIncrement = LIT64( 0x0000008000000000 );
608
        roundMask = LIT64( 0x000000FFFFFFFFFF );
609
    }
610
    else {
611
        goto precision80;
612
    }
613
    zSig0 |= ( zSig1 != 0 );
614
    if ( ! roundNearestEven ) {
615
        if ( roundingMode == float_round_to_zero ) {
616
            roundIncrement = 0;
617
        }
618
        else {
619
            roundIncrement = roundMask;
620
            if ( zSign ) {
621
                if ( roundingMode == float_round_up ) roundIncrement = 0;
622
            }
623
            else {
624
                if ( roundingMode == float_round_down ) roundIncrement = 0;
625
            }
626
        }
627
    }
628
    roundBits = zSig0 & roundMask;
629
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
630
        if (    ( 0x7FFE < zExp )
631
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
632
           ) {
633
            goto overflow;
634
        }
635
        if ( zExp <= 0 ) {
636
            isTiny =
637
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
638
                || ( zExp < 0 )
639
                || ( zSig0 <= zSig0 + roundIncrement );
640
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
641
            zExp = 0;
642
            roundBits = zSig0 & roundMask;
643
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
644
            if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
645
            zSig0 += roundIncrement;
646
            if ( (sbits64) zSig0 < 0 ) zExp = 1;
647
            roundIncrement = roundMask + 1;
648
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
649
                roundMask |= roundIncrement;
650
            }
651
            zSig0 &= ~ roundMask;
652
            return packFloatx80( zSign, zExp, zSig0 );
653
        }
654
    }
655
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
656
    zSig0 += roundIncrement;
657
    if ( zSig0 < roundIncrement ) {
658
        ++zExp;
659
        zSig0 = LIT64( 0x8000000000000000 );
660
    }
661
    roundIncrement = roundMask + 1;
662
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
663
        roundMask |= roundIncrement;
664
    }
665
    zSig0 &= ~ roundMask;
666
    if ( zSig0 == 0 ) zExp = 0;
667
    return packFloatx80( zSign, zExp, zSig0 );
668
 precision80:
669
    increment = ( (sbits64) zSig1 < 0 );
670
    if ( ! roundNearestEven ) {
671
        if ( roundingMode == float_round_to_zero ) {
672
            increment = 0;
673
        }
674
        else {
675
            if ( zSign ) {
676
                increment = ( roundingMode == float_round_down ) && zSig1;
677
            }
678
            else {
679
                increment = ( roundingMode == float_round_up ) && zSig1;
680
            }
681
        }
682
    }
683
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
684
        if (    ( 0x7FFE < zExp )
685
             || (    ( zExp == 0x7FFE )
686
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
687
                  && increment
688
                )
689
           ) {
690
            roundMask = 0;
691
 overflow:
692
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
693
            if (    ( roundingMode == float_round_to_zero )
694
                 || ( zSign && ( roundingMode == float_round_up ) )
695
                 || ( ! zSign && ( roundingMode == float_round_down ) )
696
               ) {
697
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
698
            }
699
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
700
        }
701
        if ( zExp <= 0 ) {
702
            isTiny =
703
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
704
                || ( zExp < 0 )
705
                || ! increment
706
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
707
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
708
            zExp = 0;
709
            if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
710
            if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
711
            if ( roundNearestEven ) {
712
                increment = ( (sbits64) zSig1 < 0 );
713
            }
714
            else {
715
                if ( zSign ) {
716
                    increment = ( roundingMode == float_round_down ) && zSig1;
717
                }
718
                else {
719
                    increment = ( roundingMode == float_round_up ) && zSig1;
720
                }
721
            }
722
            if ( increment ) {
723
                ++zSig0;
724
                zSig0 &=
725
                    ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
726
                if ( (sbits64) zSig0 < 0 ) zExp = 1;
727
            }
728
            return packFloatx80( zSign, zExp, zSig0 );
729
        }
730
    }
731
    if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
732
    if ( increment ) {
733
        ++zSig0;
734
        if ( zSig0 == 0 ) {
735
            ++zExp;
736
            zSig0 = LIT64( 0x8000000000000000 );
737
        }
738
        else {
739
            zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
740
        }
741
    }
742
    else {
743
        if ( zSig0 == 0 ) zExp = 0;
744
    }
745
    return packFloatx80( zSign, zExp, zSig0 );
746

    
747
}
748

    
749
/*----------------------------------------------------------------------------
750
| Takes an abstract floating-point value having sign `zSign', exponent
751
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
752
| and returns the proper extended double-precision floating-point value
753
| corresponding to the abstract input.  This routine is just like
754
| `roundAndPackFloatx80' except that the input significand does not have to be
755
| normalized.
756
*----------------------------------------------------------------------------*/
757

    
758
static floatx80
759
 normalizeRoundAndPackFloatx80(
760
     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
761
 STATUS_PARAM)
762
{
763
    int8 shiftCount;
764

    
765
    if ( zSig0 == 0 ) {
766
        zSig0 = zSig1;
767
        zSig1 = 0;
768
        zExp -= 64;
769
    }
770
    shiftCount = countLeadingZeros64( zSig0 );
771
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
772
    zExp -= shiftCount;
773
    return
774
        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
775

    
776
}
777

    
778
#endif
779

    
780
#ifdef FLOAT128
781

    
782
/*----------------------------------------------------------------------------
783
| Returns the least-significant 64 fraction bits of the quadruple-precision
784
| floating-point value `a'.
785
*----------------------------------------------------------------------------*/
786

    
787
INLINE bits64 extractFloat128Frac1( float128 a )
788
{
789

    
790
    return a.low;
791

    
792
}
793

    
794
/*----------------------------------------------------------------------------
795
| Returns the most-significant 48 fraction bits of the quadruple-precision
796
| floating-point value `a'.
797
*----------------------------------------------------------------------------*/
798

    
799
INLINE bits64 extractFloat128Frac0( float128 a )
800
{
801

    
802
    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
803

    
804
}
805

    
806
/*----------------------------------------------------------------------------
807
| Returns the exponent bits of the quadruple-precision floating-point value
808
| `a'.
809
*----------------------------------------------------------------------------*/
810

    
811
INLINE int32 extractFloat128Exp( float128 a )
812
{
813

    
814
    return ( a.high>>48 ) & 0x7FFF;
815

    
816
}
817

    
818
/*----------------------------------------------------------------------------
819
| Returns the sign bit of the quadruple-precision floating-point value `a'.
820
*----------------------------------------------------------------------------*/
821

    
822
INLINE flag extractFloat128Sign( float128 a )
823
{
824

    
825
    return a.high>>63;
826

    
827
}
828

    
829
/*----------------------------------------------------------------------------
830
| Normalizes the subnormal quadruple-precision floating-point value
831
| represented by the denormalized significand formed by the concatenation of
832
| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
833
| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
834
| significand are stored at the location pointed to by `zSig0Ptr', and the
835
| least significant 64 bits of the normalized significand are stored at the
836
| location pointed to by `zSig1Ptr'.
837
*----------------------------------------------------------------------------*/
838

    
839
static void
840
 normalizeFloat128Subnormal(
841
     bits64 aSig0,
842
     bits64 aSig1,
843
     int32 *zExpPtr,
844
     bits64 *zSig0Ptr,
845
     bits64 *zSig1Ptr
846
 )
847
{
848
    int8 shiftCount;
849

    
850
    if ( aSig0 == 0 ) {
851
        shiftCount = countLeadingZeros64( aSig1 ) - 15;
852
        if ( shiftCount < 0 ) {
853
            *zSig0Ptr = aSig1>>( - shiftCount );
854
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
855
        }
856
        else {
857
            *zSig0Ptr = aSig1<<shiftCount;
858
            *zSig1Ptr = 0;
859
        }
860
        *zExpPtr = - shiftCount - 63;
861
    }
862
    else {
863
        shiftCount = countLeadingZeros64( aSig0 ) - 15;
864
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
865
        *zExpPtr = 1 - shiftCount;
866
    }
867

    
868
}
869

    
870
/*----------------------------------------------------------------------------
871
| Packs the sign `zSign', the exponent `zExp', and the significand formed
872
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
873
| floating-point value, returning the result.  After being shifted into the
874
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
875
| added together to form the most significant 32 bits of the result.  This
876
| means that any integer portion of `zSig0' will be added into the exponent.
877
| Since a properly normalized significand will have an integer portion equal
878
| to 1, the `zExp' input should be 1 less than the desired result exponent
879
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
880
| significand.
881
*----------------------------------------------------------------------------*/
882

    
883
INLINE float128
884
 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
885
{
886
    float128 z;
887

    
888
    z.low = zSig1;
889
    z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
890
    return z;
891

    
892
}
893

    
894
/*----------------------------------------------------------------------------
895
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
896
| and extended significand formed by the concatenation of `zSig0', `zSig1',
897
| and `zSig2', and returns the proper quadruple-precision floating-point value
898
| corresponding to the abstract input.  Ordinarily, the abstract value is
899
| simply rounded and packed into the quadruple-precision format, with the
900
| inexact exception raised if the abstract input cannot be represented
901
| exactly.  However, if the abstract value is too large, the overflow and
902
| inexact exceptions are raised and an infinity or maximal finite value is
903
| returned.  If the abstract value is too small, the input value is rounded to
904
| a subnormal number, and the underflow and inexact exceptions are raised if
905
| the abstract input cannot be represented exactly as a subnormal quadruple-
906
| precision floating-point number.
907
|     The input significand must be normalized or smaller.  If the input
908
| significand is not normalized, `zExp' must be 0; in that case, the result
909
| returned is a subnormal number, and it must not require rounding.  In the
910
| usual case that the input significand is normalized, `zExp' must be 1 less
911
| than the ``true'' floating-point exponent.  The handling of underflow and
912
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
913
*----------------------------------------------------------------------------*/
914

    
915
static float128
916
 roundAndPackFloat128(
917
     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
918
{
919
    int8 roundingMode;
920
    flag roundNearestEven, increment, isTiny;
921

    
922
    roundingMode = STATUS(float_rounding_mode);
923
    roundNearestEven = ( roundingMode == float_round_nearest_even );
924
    increment = ( (sbits64) zSig2 < 0 );
925
    if ( ! roundNearestEven ) {
926
        if ( roundingMode == float_round_to_zero ) {
927
            increment = 0;
928
        }
929
        else {
930
            if ( zSign ) {
931
                increment = ( roundingMode == float_round_down ) && zSig2;
932
            }
933
            else {
934
                increment = ( roundingMode == float_round_up ) && zSig2;
935
            }
936
        }
937
    }
938
    if ( 0x7FFD <= (bits32) zExp ) {
939
        if (    ( 0x7FFD < zExp )
940
             || (    ( zExp == 0x7FFD )
941
                  && eq128(
942
                         LIT64( 0x0001FFFFFFFFFFFF ),
943
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
944
                         zSig0,
945
                         zSig1
946
                     )
947
                  && increment
948
                )
949
           ) {
950
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
951
            if (    ( roundingMode == float_round_to_zero )
952
                 || ( zSign && ( roundingMode == float_round_up ) )
953
                 || ( ! zSign && ( roundingMode == float_round_down ) )
954
               ) {
955
                return
956
                    packFloat128(
957
                        zSign,
958
                        0x7FFE,
959
                        LIT64( 0x0000FFFFFFFFFFFF ),
960
                        LIT64( 0xFFFFFFFFFFFFFFFF )
961
                    );
962
            }
963
            return packFloat128( zSign, 0x7FFF, 0, 0 );
964
        }
965
        if ( zExp < 0 ) {
966
            isTiny =
967
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
968
                || ( zExp < -1 )
969
                || ! increment
970
                || lt128(
971
                       zSig0,
972
                       zSig1,
973
                       LIT64( 0x0001FFFFFFFFFFFF ),
974
                       LIT64( 0xFFFFFFFFFFFFFFFF )
975
                   );
976
            shift128ExtraRightJamming(
977
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
978
            zExp = 0;
979
            if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
980
            if ( roundNearestEven ) {
981
                increment = ( (sbits64) zSig2 < 0 );
982
            }
983
            else {
984
                if ( zSign ) {
985
                    increment = ( roundingMode == float_round_down ) && zSig2;
986
                }
987
                else {
988
                    increment = ( roundingMode == float_round_up ) && zSig2;
989
                }
990
            }
991
        }
992
    }
993
    if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
994
    if ( increment ) {
995
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
996
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
997
    }
998
    else {
999
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1000
    }
1001
    return packFloat128( zSign, zExp, zSig0, zSig1 );
1002

    
1003
}
1004

    
1005
/*----------------------------------------------------------------------------
1006
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1007
| and significand formed by the concatenation of `zSig0' and `zSig1', and
1008
| returns the proper quadruple-precision floating-point value corresponding
1009
| to the abstract input.  This routine is just like `roundAndPackFloat128'
1010
| except that the input significand has fewer bits and does not have to be
1011
| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1012
| point exponent.
1013
*----------------------------------------------------------------------------*/
1014

    
1015
static float128
1016
 normalizeRoundAndPackFloat128(
1017
     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1018
{
1019
    int8 shiftCount;
1020
    bits64 zSig2;
1021

    
1022
    if ( zSig0 == 0 ) {
1023
        zSig0 = zSig1;
1024
        zSig1 = 0;
1025
        zExp -= 64;
1026
    }
1027
    shiftCount = countLeadingZeros64( zSig0 ) - 15;
1028
    if ( 0 <= shiftCount ) {
1029
        zSig2 = 0;
1030
        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1031
    }
1032
    else {
1033
        shift128ExtraRightJamming(
1034
            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1035
    }
1036
    zExp -= shiftCount;
1037
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1038

    
1039
}
1040

    
1041
#endif
1042

    
1043
/*----------------------------------------------------------------------------
1044
| Returns the result of converting the 32-bit two's complement integer `a'
1045
| to the single-precision floating-point format.  The conversion is performed
1046
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1047
*----------------------------------------------------------------------------*/
1048

    
1049
float32 int32_to_float32( int32 a STATUS_PARAM )
1050
{
1051
    flag zSign;
1052

    
1053
    if ( a == 0 ) return 0;
1054
    if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1055
    zSign = ( a < 0 );
1056
    return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1057

    
1058
}
1059

    
1060
/*----------------------------------------------------------------------------
1061
| Returns the result of converting the 32-bit two's complement integer `a'
1062
| to the double-precision floating-point format.  The conversion is performed
1063
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1064
*----------------------------------------------------------------------------*/
1065

    
1066
float64 int32_to_float64( int32 a STATUS_PARAM )
1067
{
1068
    flag zSign;
1069
    uint32 absA;
1070
    int8 shiftCount;
1071
    bits64 zSig;
1072

    
1073
    if ( a == 0 ) return 0;
1074
    zSign = ( a < 0 );
1075
    absA = zSign ? - a : a;
1076
    shiftCount = countLeadingZeros32( absA ) + 21;
1077
    zSig = absA;
1078
    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1079

    
1080
}
1081

    
1082
#ifdef FLOATX80
1083

    
1084
/*----------------------------------------------------------------------------
1085
| Returns the result of converting the 32-bit two's complement integer `a'
1086
| to the extended double-precision floating-point format.  The conversion
1087
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1088
| Arithmetic.
1089
*----------------------------------------------------------------------------*/
1090

    
1091
floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1092
{
1093
    flag zSign;
1094
    uint32 absA;
1095
    int8 shiftCount;
1096
    bits64 zSig;
1097

    
1098
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1099
    zSign = ( a < 0 );
1100
    absA = zSign ? - a : a;
1101
    shiftCount = countLeadingZeros32( absA ) + 32;
1102
    zSig = absA;
1103
    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1104

    
1105
}
1106

    
1107
#endif
1108

    
1109
#ifdef FLOAT128
1110

    
1111
/*----------------------------------------------------------------------------
1112
| Returns the result of converting the 32-bit two's complement integer `a' to
1113
| the quadruple-precision floating-point format.  The conversion is performed
1114
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1115
*----------------------------------------------------------------------------*/
1116

    
1117
float128 int32_to_float128( int32 a STATUS_PARAM )
1118
{
1119
    flag zSign;
1120
    uint32 absA;
1121
    int8 shiftCount;
1122
    bits64 zSig0;
1123

    
1124
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1125
    zSign = ( a < 0 );
1126
    absA = zSign ? - a : a;
1127
    shiftCount = countLeadingZeros32( absA ) + 17;
1128
    zSig0 = absA;
1129
    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1130

    
1131
}
1132

    
1133
#endif
1134

    
1135
/*----------------------------------------------------------------------------
1136
| Returns the result of converting the 64-bit two's complement integer `a'
1137
| to the single-precision floating-point format.  The conversion is performed
1138
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1139
*----------------------------------------------------------------------------*/
1140

    
1141
float32 int64_to_float32( int64 a STATUS_PARAM )
1142
{
1143
    flag zSign;
1144
    uint64 absA;
1145
    int8 shiftCount;
1146

    
1147
    if ( a == 0 ) return 0;
1148
    zSign = ( a < 0 );
1149
    absA = zSign ? - a : a;
1150
    shiftCount = countLeadingZeros64( absA ) - 40;
1151
    if ( 0 <= shiftCount ) {
1152
        return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1153
    }
1154
    else {
1155
        shiftCount += 7;
1156
        if ( shiftCount < 0 ) {
1157
            shift64RightJamming( absA, - shiftCount, &absA );
1158
        }
1159
        else {
1160
            absA <<= shiftCount;
1161
        }
1162
        return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1163
    }
1164

    
1165
}
1166

    
1167
/*----------------------------------------------------------------------------
1168
| Returns the result of converting the 64-bit two's complement integer `a'
1169
| to the double-precision floating-point format.  The conversion is performed
1170
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1171
*----------------------------------------------------------------------------*/
1172

    
1173
float64 int64_to_float64( int64 a STATUS_PARAM )
1174
{
1175
    flag zSign;
1176

    
1177
    if ( a == 0 ) return 0;
1178
    if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1179
        return packFloat64( 1, 0x43E, 0 );
1180
    }
1181
    zSign = ( a < 0 );
1182
    return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1183

    
1184
}
1185

    
1186
#ifdef FLOATX80
1187

    
1188
/*----------------------------------------------------------------------------
1189
| Returns the result of converting the 64-bit two's complement integer `a'
1190
| to the extended double-precision floating-point format.  The conversion
1191
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1192
| Arithmetic.
1193
*----------------------------------------------------------------------------*/
1194

    
1195
floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1196
{
1197
    flag zSign;
1198
    uint64 absA;
1199
    int8 shiftCount;
1200

    
1201
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1202
    zSign = ( a < 0 );
1203
    absA = zSign ? - a : a;
1204
    shiftCount = countLeadingZeros64( absA );
1205
    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1206

    
1207
}
1208

    
1209
#endif
1210

    
1211
#ifdef FLOAT128
1212

    
1213
/*----------------------------------------------------------------------------
1214
| Returns the result of converting the 64-bit two's complement integer `a' to
1215
| the quadruple-precision floating-point format.  The conversion is performed
1216
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1217
*----------------------------------------------------------------------------*/
1218

    
1219
float128 int64_to_float128( int64 a STATUS_PARAM )
1220
{
1221
    flag zSign;
1222
    uint64 absA;
1223
    int8 shiftCount;
1224
    int32 zExp;
1225
    bits64 zSig0, zSig1;
1226

    
1227
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1228
    zSign = ( a < 0 );
1229
    absA = zSign ? - a : a;
1230
    shiftCount = countLeadingZeros64( absA ) + 49;
1231
    zExp = 0x406E - shiftCount;
1232
    if ( 64 <= shiftCount ) {
1233
        zSig1 = 0;
1234
        zSig0 = absA;
1235
        shiftCount -= 64;
1236
    }
1237
    else {
1238
        zSig1 = absA;
1239
        zSig0 = 0;
1240
    }
1241
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1242
    return packFloat128( zSign, zExp, zSig0, zSig1 );
1243

    
1244
}
1245

    
1246
#endif
1247

    
1248
/*----------------------------------------------------------------------------
1249
| Returns the result of converting the single-precision floating-point value
1250
| `a' to the 32-bit two's complement integer format.  The conversion is
1251
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1252
| Arithmetic---which means in particular that the conversion is rounded
1253
| according to the current rounding mode.  If `a' is a NaN, the largest
1254
| positive integer is returned.  Otherwise, if the conversion overflows, the
1255
| largest integer with the same sign as `a' is returned.
1256
*----------------------------------------------------------------------------*/
1257

    
1258
int32 float32_to_int32( float32 a STATUS_PARAM )
1259
{
1260
    flag aSign;
1261
    int16 aExp, shiftCount;
1262
    bits32 aSig;
1263
    bits64 aSig64;
1264

    
1265
    aSig = extractFloat32Frac( a );
1266
    aExp = extractFloat32Exp( a );
1267
    aSign = extractFloat32Sign( a );
1268
    if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1269
    if ( aExp ) aSig |= 0x00800000;
1270
    shiftCount = 0xAF - aExp;
1271
    aSig64 = aSig;
1272
    aSig64 <<= 32;
1273
    if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1274
    return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1275

    
1276
}
1277

    
1278
/*----------------------------------------------------------------------------
1279
| Returns the result of converting the single-precision floating-point value
1280
| `a' to the 32-bit two's complement integer format.  The conversion is
1281
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1282
| Arithmetic, except that the conversion is always rounded toward zero.
1283
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1284
| the conversion overflows, the largest integer with the same sign as `a' is
1285
| returned.
1286
*----------------------------------------------------------------------------*/
1287

    
1288
int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1289
{
1290
    flag aSign;
1291
    int16 aExp, shiftCount;
1292
    bits32 aSig;
1293
    int32 z;
1294

    
1295
    aSig = extractFloat32Frac( a );
1296
    aExp = extractFloat32Exp( a );
1297
    aSign = extractFloat32Sign( a );
1298
    shiftCount = aExp - 0x9E;
1299
    if ( 0 <= shiftCount ) {
1300
        if ( a != 0xCF000000 ) {
1301
            float_raise( float_flag_invalid STATUS_VAR);
1302
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1303
        }
1304
        return (sbits32) 0x80000000;
1305
    }
1306
    else if ( aExp <= 0x7E ) {
1307
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1308
        return 0;
1309
    }
1310
    aSig = ( aSig | 0x00800000 )<<8;
1311
    z = aSig>>( - shiftCount );
1312
    if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1313
        STATUS(float_exception_flags) |= float_flag_inexact;
1314
    }
1315
    if ( aSign ) z = - z;
1316
    return z;
1317

    
1318
}
1319

    
1320
/*----------------------------------------------------------------------------
1321
| Returns the result of converting the single-precision floating-point value
1322
| `a' to the 64-bit two's complement integer format.  The conversion is
1323
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1324
| Arithmetic---which means in particular that the conversion is rounded
1325
| according to the current rounding mode.  If `a' is a NaN, the largest
1326
| positive integer is returned.  Otherwise, if the conversion overflows, the
1327
| largest integer with the same sign as `a' is returned.
1328
*----------------------------------------------------------------------------*/
1329

    
1330
int64 float32_to_int64( float32 a STATUS_PARAM )
1331
{
1332
    flag aSign;
1333
    int16 aExp, shiftCount;
1334
    bits32 aSig;
1335
    bits64 aSig64, aSigExtra;
1336

    
1337
    aSig = extractFloat32Frac( a );
1338
    aExp = extractFloat32Exp( a );
1339
    aSign = extractFloat32Sign( a );
1340
    shiftCount = 0xBE - aExp;
1341
    if ( shiftCount < 0 ) {
1342
        float_raise( float_flag_invalid STATUS_VAR);
1343
        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1344
            return LIT64( 0x7FFFFFFFFFFFFFFF );
1345
        }
1346
        return (sbits64) LIT64( 0x8000000000000000 );
1347
    }
1348
    if ( aExp ) aSig |= 0x00800000;
1349
    aSig64 = aSig;
1350
    aSig64 <<= 40;
1351
    shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1352
    return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1353

    
1354
}
1355

    
1356
/*----------------------------------------------------------------------------
1357
| Returns the result of converting the single-precision floating-point value
1358
| `a' to the 64-bit two's complement integer format.  The conversion is
1359
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1360
| Arithmetic, except that the conversion is always rounded toward zero.  If
1361
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1362
| conversion overflows, the largest integer with the same sign as `a' is
1363
| returned.
1364
*----------------------------------------------------------------------------*/
1365

    
1366
int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1367
{
1368
    flag aSign;
1369
    int16 aExp, shiftCount;
1370
    bits32 aSig;
1371
    bits64 aSig64;
1372
    int64 z;
1373

    
1374
    aSig = extractFloat32Frac( a );
1375
    aExp = extractFloat32Exp( a );
1376
    aSign = extractFloat32Sign( a );
1377
    shiftCount = aExp - 0xBE;
1378
    if ( 0 <= shiftCount ) {
1379
        if ( a != 0xDF000000 ) {
1380
            float_raise( float_flag_invalid STATUS_VAR);
1381
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1382
                return LIT64( 0x7FFFFFFFFFFFFFFF );
1383
            }
1384
        }
1385
        return (sbits64) LIT64( 0x8000000000000000 );
1386
    }
1387
    else if ( aExp <= 0x7E ) {
1388
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1389
        return 0;
1390
    }
1391
    aSig64 = aSig | 0x00800000;
1392
    aSig64 <<= 40;
1393
    z = aSig64>>( - shiftCount );
1394
    if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1395
        STATUS(float_exception_flags) |= float_flag_inexact;
1396
    }
1397
    if ( aSign ) z = - z;
1398
    return z;
1399

    
1400
}
1401

    
1402
/*----------------------------------------------------------------------------
1403
| Returns the result of converting the single-precision floating-point value
1404
| `a' to the double-precision floating-point format.  The conversion is
1405
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1406
| Arithmetic.
1407
*----------------------------------------------------------------------------*/
1408

    
1409
float64 float32_to_float64( float32 a STATUS_PARAM )
1410
{
1411
    flag aSign;
1412
    int16 aExp;
1413
    bits32 aSig;
1414

    
1415
    aSig = extractFloat32Frac( a );
1416
    aExp = extractFloat32Exp( a );
1417
    aSign = extractFloat32Sign( a );
1418
    if ( aExp == 0xFF ) {
1419
        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1420
        return packFloat64( aSign, 0x7FF, 0 );
1421
    }
1422
    if ( aExp == 0 ) {
1423
        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1424
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1425
        --aExp;
1426
    }
1427
    return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1428

    
1429
}
1430

    
1431
#ifdef FLOATX80
1432

    
1433
/*----------------------------------------------------------------------------
1434
| Returns the result of converting the single-precision floating-point value
1435
| `a' to the extended double-precision floating-point format.  The conversion
1436
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1437
| Arithmetic.
1438
*----------------------------------------------------------------------------*/
1439

    
1440
floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1441
{
1442
    flag aSign;
1443
    int16 aExp;
1444
    bits32 aSig;
1445

    
1446
    aSig = extractFloat32Frac( a );
1447
    aExp = extractFloat32Exp( a );
1448
    aSign = extractFloat32Sign( a );
1449
    if ( aExp == 0xFF ) {
1450
        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1451
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1452
    }
1453
    if ( aExp == 0 ) {
1454
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1455
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1456
    }
1457
    aSig |= 0x00800000;
1458
    return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1459

    
1460
}
1461

    
1462
#endif
1463

    
1464
#ifdef FLOAT128
1465

    
1466
/*----------------------------------------------------------------------------
1467
| Returns the result of converting the single-precision floating-point value
1468
| `a' to the double-precision floating-point format.  The conversion is
1469
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1470
| Arithmetic.
1471
*----------------------------------------------------------------------------*/
1472

    
1473
float128 float32_to_float128( float32 a STATUS_PARAM )
1474
{
1475
    flag aSign;
1476
    int16 aExp;
1477
    bits32 aSig;
1478

    
1479
    aSig = extractFloat32Frac( a );
1480
    aExp = extractFloat32Exp( a );
1481
    aSign = extractFloat32Sign( a );
1482
    if ( aExp == 0xFF ) {
1483
        if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1484
        return packFloat128( aSign, 0x7FFF, 0, 0 );
1485
    }
1486
    if ( aExp == 0 ) {
1487
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1488
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1489
        --aExp;
1490
    }
1491
    return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1492

    
1493
}
1494

    
1495
#endif
1496

    
1497
/*----------------------------------------------------------------------------
1498
| Rounds the single-precision floating-point value `a' to an integer, and
1499
| returns the result as a single-precision floating-point value.  The
1500
| operation is performed according to the IEC/IEEE Standard for Binary
1501
| Floating-Point Arithmetic.
1502
*----------------------------------------------------------------------------*/
1503

    
1504
float32 float32_round_to_int( float32 a STATUS_PARAM)
1505
{
1506
    flag aSign;
1507
    int16 aExp;
1508
    bits32 lastBitMask, roundBitsMask;
1509
    int8 roundingMode;
1510
    float32 z;
1511

    
1512
    aExp = extractFloat32Exp( a );
1513
    if ( 0x96 <= aExp ) {
1514
        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1515
            return propagateFloat32NaN( a, a STATUS_VAR );
1516
        }
1517
        return a;
1518
    }
1519
    if ( aExp <= 0x7E ) {
1520
        if ( (bits32) ( a<<1 ) == 0 ) return a;
1521
        STATUS(float_exception_flags) |= float_flag_inexact;
1522
        aSign = extractFloat32Sign( a );
1523
        switch ( STATUS(float_rounding_mode) ) {
1524
         case float_round_nearest_even:
1525
            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1526
                return packFloat32( aSign, 0x7F, 0 );
1527
            }
1528
            break;
1529
         case float_round_down:
1530
            return aSign ? 0xBF800000 : 0;
1531
         case float_round_up:
1532
            return aSign ? 0x80000000 : 0x3F800000;
1533
        }
1534
        return packFloat32( aSign, 0, 0 );
1535
    }
1536
    lastBitMask = 1;
1537
    lastBitMask <<= 0x96 - aExp;
1538
    roundBitsMask = lastBitMask - 1;
1539
    z = a;
1540
    roundingMode = STATUS(float_rounding_mode);
1541
    if ( roundingMode == float_round_nearest_even ) {
1542
        z += lastBitMask>>1;
1543
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1544
    }
1545
    else if ( roundingMode != float_round_to_zero ) {
1546
        if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1547
            z += roundBitsMask;
1548
        }
1549
    }
1550
    z &= ~ roundBitsMask;
1551
    if ( z != a ) STATUS(float_exception_flags) |= float_flag_inexact;
1552
    return z;
1553

    
1554
}
1555

    
1556
/*----------------------------------------------------------------------------
1557
| Returns the result of adding the absolute values of the single-precision
1558
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1559
| before being returned.  `zSign' is ignored if the result is a NaN.
1560
| The addition is performed according to the IEC/IEEE Standard for Binary
1561
| Floating-Point Arithmetic.
1562
*----------------------------------------------------------------------------*/
1563

    
1564
static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1565
{
1566
    int16 aExp, bExp, zExp;
1567
    bits32 aSig, bSig, zSig;
1568
    int16 expDiff;
1569

    
1570
    aSig = extractFloat32Frac( a );
1571
    aExp = extractFloat32Exp( a );
1572
    bSig = extractFloat32Frac( b );
1573
    bExp = extractFloat32Exp( b );
1574
    expDiff = aExp - bExp;
1575
    aSig <<= 6;
1576
    bSig <<= 6;
1577
    if ( 0 < expDiff ) {
1578
        if ( aExp == 0xFF ) {
1579
            if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1580
            return a;
1581
        }
1582
        if ( bExp == 0 ) {
1583
            --expDiff;
1584
        }
1585
        else {
1586
            bSig |= 0x20000000;
1587
        }
1588
        shift32RightJamming( bSig, expDiff, &bSig );
1589
        zExp = aExp;
1590
    }
1591
    else if ( expDiff < 0 ) {
1592
        if ( bExp == 0xFF ) {
1593
            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1594
            return packFloat32( zSign, 0xFF, 0 );
1595
        }
1596
        if ( aExp == 0 ) {
1597
            ++expDiff;
1598
        }
1599
        else {
1600
            aSig |= 0x20000000;
1601
        }
1602
        shift32RightJamming( aSig, - expDiff, &aSig );
1603
        zExp = bExp;
1604
    }
1605
    else {
1606
        if ( aExp == 0xFF ) {
1607
            if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1608
            return a;
1609
        }
1610
        if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1611
        zSig = 0x40000000 + aSig + bSig;
1612
        zExp = aExp;
1613
        goto roundAndPack;
1614
    }
1615
    aSig |= 0x20000000;
1616
    zSig = ( aSig + bSig )<<1;
1617
    --zExp;
1618
    if ( (sbits32) zSig < 0 ) {
1619
        zSig = aSig + bSig;
1620
        ++zExp;
1621
    }
1622
 roundAndPack:
1623
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1624

    
1625
}
1626

    
1627
/*----------------------------------------------------------------------------
1628
| Returns the result of subtracting the absolute values of the single-
1629
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
1630
| difference is negated before being returned.  `zSign' is ignored if the
1631
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
1632
| Standard for Binary Floating-Point Arithmetic.
1633
*----------------------------------------------------------------------------*/
1634

    
1635
static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1636
{
1637
    int16 aExp, bExp, zExp;
1638
    bits32 aSig, bSig, zSig;
1639
    int16 expDiff;
1640

    
1641
    aSig = extractFloat32Frac( a );
1642
    aExp = extractFloat32Exp( a );
1643
    bSig = extractFloat32Frac( b );
1644
    bExp = extractFloat32Exp( b );
1645
    expDiff = aExp - bExp;
1646
    aSig <<= 7;
1647
    bSig <<= 7;
1648
    if ( 0 < expDiff ) goto aExpBigger;
1649
    if ( expDiff < 0 ) goto bExpBigger;
1650
    if ( aExp == 0xFF ) {
1651
        if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1652
        float_raise( float_flag_invalid STATUS_VAR);
1653
        return float32_default_nan;
1654
    }
1655
    if ( aExp == 0 ) {
1656
        aExp = 1;
1657
        bExp = 1;
1658
    }
1659
    if ( bSig < aSig ) goto aBigger;
1660
    if ( aSig < bSig ) goto bBigger;
1661
    return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1662
 bExpBigger:
1663
    if ( bExp == 0xFF ) {
1664
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1665
        return packFloat32( zSign ^ 1, 0xFF, 0 );
1666
    }
1667
    if ( aExp == 0 ) {
1668
        ++expDiff;
1669
    }
1670
    else {
1671
        aSig |= 0x40000000;
1672
    }
1673
    shift32RightJamming( aSig, - expDiff, &aSig );
1674
    bSig |= 0x40000000;
1675
 bBigger:
1676
    zSig = bSig - aSig;
1677
    zExp = bExp;
1678
    zSign ^= 1;
1679
    goto normalizeRoundAndPack;
1680
 aExpBigger:
1681
    if ( aExp == 0xFF ) {
1682
        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1683
        return a;
1684
    }
1685
    if ( bExp == 0 ) {
1686
        --expDiff;
1687
    }
1688
    else {
1689
        bSig |= 0x40000000;
1690
    }
1691
    shift32RightJamming( bSig, expDiff, &bSig );
1692
    aSig |= 0x40000000;
1693
 aBigger:
1694
    zSig = aSig - bSig;
1695
    zExp = aExp;
1696
 normalizeRoundAndPack:
1697
    --zExp;
1698
    return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1699

    
1700
}
1701

    
1702
/*----------------------------------------------------------------------------
1703
| Returns the result of adding the single-precision floating-point values `a'
1704
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
1705
| Binary Floating-Point Arithmetic.
1706
*----------------------------------------------------------------------------*/
1707

    
1708
float32 float32_add( float32 a, float32 b STATUS_PARAM )
1709
{
1710
    flag aSign, bSign;
1711

    
1712
    aSign = extractFloat32Sign( a );
1713
    bSign = extractFloat32Sign( b );
1714
    if ( aSign == bSign ) {
1715
        return addFloat32Sigs( a, b, aSign STATUS_VAR);
1716
    }
1717
    else {
1718
        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1719
    }
1720

    
1721
}
1722

    
1723
/*----------------------------------------------------------------------------
1724
| Returns the result of subtracting the single-precision floating-point values
1725
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1726
| for Binary Floating-Point Arithmetic.
1727
*----------------------------------------------------------------------------*/
1728

    
1729
float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1730
{
1731
    flag aSign, bSign;
1732

    
1733
    aSign = extractFloat32Sign( a );
1734
    bSign = extractFloat32Sign( b );
1735
    if ( aSign == bSign ) {
1736
        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1737
    }
1738
    else {
1739
        return addFloat32Sigs( a, b, aSign STATUS_VAR );
1740
    }
1741

    
1742
}
1743

    
1744
/*----------------------------------------------------------------------------
1745
| Returns the result of multiplying the single-precision floating-point values
1746
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1747
| for Binary Floating-Point Arithmetic.
1748
*----------------------------------------------------------------------------*/
1749

    
1750
float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1751
{
1752
    flag aSign, bSign, zSign;
1753
    int16 aExp, bExp, zExp;
1754
    bits32 aSig, bSig;
1755
    bits64 zSig64;
1756
    bits32 zSig;
1757

    
1758
    aSig = extractFloat32Frac( a );
1759
    aExp = extractFloat32Exp( a );
1760
    aSign = extractFloat32Sign( a );
1761
    bSig = extractFloat32Frac( b );
1762
    bExp = extractFloat32Exp( b );
1763
    bSign = extractFloat32Sign( b );
1764
    zSign = aSign ^ bSign;
1765
    if ( aExp == 0xFF ) {
1766
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1767
            return propagateFloat32NaN( a, b STATUS_VAR );
1768
        }
1769
        if ( ( bExp | bSig ) == 0 ) {
1770
            float_raise( float_flag_invalid STATUS_VAR);
1771
            return float32_default_nan;
1772
        }
1773
        return packFloat32( zSign, 0xFF, 0 );
1774
    }
1775
    if ( bExp == 0xFF ) {
1776
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1777
        if ( ( aExp | aSig ) == 0 ) {
1778
            float_raise( float_flag_invalid STATUS_VAR);
1779
            return float32_default_nan;
1780
        }
1781
        return packFloat32( zSign, 0xFF, 0 );
1782
    }
1783
    if ( aExp == 0 ) {
1784
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1785
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1786
    }
1787
    if ( bExp == 0 ) {
1788
        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1789
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1790
    }
1791
    zExp = aExp + bExp - 0x7F;
1792
    aSig = ( aSig | 0x00800000 )<<7;
1793
    bSig = ( bSig | 0x00800000 )<<8;
1794
    shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1795
    zSig = zSig64;
1796
    if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1797
        zSig <<= 1;
1798
        --zExp;
1799
    }
1800
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1801

    
1802
}
1803

    
1804
/*----------------------------------------------------------------------------
1805
| Returns the result of dividing the single-precision floating-point value `a'
1806
| by the corresponding value `b'.  The operation is performed according to the
1807
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1808
*----------------------------------------------------------------------------*/
1809

    
1810
float32 float32_div( float32 a, float32 b STATUS_PARAM )
1811
{
1812
    flag aSign, bSign, zSign;
1813
    int16 aExp, bExp, zExp;
1814
    bits32 aSig, bSig, zSig;
1815

    
1816
    aSig = extractFloat32Frac( a );
1817
    aExp = extractFloat32Exp( a );
1818
    aSign = extractFloat32Sign( a );
1819
    bSig = extractFloat32Frac( b );
1820
    bExp = extractFloat32Exp( b );
1821
    bSign = extractFloat32Sign( b );
1822
    zSign = aSign ^ bSign;
1823
    if ( aExp == 0xFF ) {
1824
        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1825
        if ( bExp == 0xFF ) {
1826
            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1827
            float_raise( float_flag_invalid STATUS_VAR);
1828
            return float32_default_nan;
1829
        }
1830
        return packFloat32( zSign, 0xFF, 0 );
1831
    }
1832
    if ( bExp == 0xFF ) {
1833
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1834
        return packFloat32( zSign, 0, 0 );
1835
    }
1836
    if ( bExp == 0 ) {
1837
        if ( bSig == 0 ) {
1838
            if ( ( aExp | aSig ) == 0 ) {
1839
                float_raise( float_flag_invalid STATUS_VAR);
1840
                return float32_default_nan;
1841
            }
1842
            float_raise( float_flag_divbyzero STATUS_VAR);
1843
            return packFloat32( zSign, 0xFF, 0 );
1844
        }
1845
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1846
    }
1847
    if ( aExp == 0 ) {
1848
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1849
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1850
    }
1851
    zExp = aExp - bExp + 0x7D;
1852
    aSig = ( aSig | 0x00800000 )<<7;
1853
    bSig = ( bSig | 0x00800000 )<<8;
1854
    if ( bSig <= ( aSig + aSig ) ) {
1855
        aSig >>= 1;
1856
        ++zExp;
1857
    }
1858
    zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1859
    if ( ( zSig & 0x3F ) == 0 ) {
1860
        zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1861
    }
1862
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1863

    
1864
}
1865

    
1866
/*----------------------------------------------------------------------------
1867
| Returns the remainder of the single-precision floating-point value `a'
1868
| with respect to the corresponding value `b'.  The operation is performed
1869
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1870
*----------------------------------------------------------------------------*/
1871

    
1872
float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1873
{
1874
    flag aSign, bSign, zSign;
1875
    int16 aExp, bExp, expDiff;
1876
    bits32 aSig, bSig;
1877
    bits32 q;
1878
    bits64 aSig64, bSig64, q64;
1879
    bits32 alternateASig;
1880
    sbits32 sigMean;
1881

    
1882
    aSig = extractFloat32Frac( a );
1883
    aExp = extractFloat32Exp( a );
1884
    aSign = extractFloat32Sign( a );
1885
    bSig = extractFloat32Frac( b );
1886
    bExp = extractFloat32Exp( b );
1887
    bSign = extractFloat32Sign( b );
1888
    if ( aExp == 0xFF ) {
1889
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1890
            return propagateFloat32NaN( a, b STATUS_VAR );
1891
        }
1892
        float_raise( float_flag_invalid STATUS_VAR);
1893
        return float32_default_nan;
1894
    }
1895
    if ( bExp == 0xFF ) {
1896
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1897
        return a;
1898
    }
1899
    if ( bExp == 0 ) {
1900
        if ( bSig == 0 ) {
1901
            float_raise( float_flag_invalid STATUS_VAR);
1902
            return float32_default_nan;
1903
        }
1904
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1905
    }
1906
    if ( aExp == 0 ) {
1907
        if ( aSig == 0 ) return a;
1908
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1909
    }
1910
    expDiff = aExp - bExp;
1911
    aSig |= 0x00800000;
1912
    bSig |= 0x00800000;
1913
    if ( expDiff < 32 ) {
1914
        aSig <<= 8;
1915
        bSig <<= 8;
1916
        if ( expDiff < 0 ) {
1917
            if ( expDiff < -1 ) return a;
1918
            aSig >>= 1;
1919
        }
1920
        q = ( bSig <= aSig );
1921
        if ( q ) aSig -= bSig;
1922
        if ( 0 < expDiff ) {
1923
            q = ( ( (bits64) aSig )<<32 ) / bSig;
1924
            q >>= 32 - expDiff;
1925
            bSig >>= 2;
1926
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1927
        }
1928
        else {
1929
            aSig >>= 2;
1930
            bSig >>= 2;
1931
        }
1932
    }
1933
    else {
1934
        if ( bSig <= aSig ) aSig -= bSig;
1935
        aSig64 = ( (bits64) aSig )<<40;
1936
        bSig64 = ( (bits64) bSig )<<40;
1937
        expDiff -= 64;
1938
        while ( 0 < expDiff ) {
1939
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1940
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1941
            aSig64 = - ( ( bSig * q64 )<<38 );
1942
            expDiff -= 62;
1943
        }
1944
        expDiff += 64;
1945
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1946
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1947
        q = q64>>( 64 - expDiff );
1948
        bSig <<= 6;
1949
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1950
    }
1951
    do {
1952
        alternateASig = aSig;
1953
        ++q;
1954
        aSig -= bSig;
1955
    } while ( 0 <= (sbits32) aSig );
1956
    sigMean = aSig + alternateASig;
1957
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1958
        aSig = alternateASig;
1959
    }
1960
    zSign = ( (sbits32) aSig < 0 );
1961
    if ( zSign ) aSig = - aSig;
1962
    return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
1963

    
1964
}
1965

    
1966
/*----------------------------------------------------------------------------
1967
| Returns the square root of the single-precision floating-point value `a'.
1968
| The operation is performed according to the IEC/IEEE Standard for Binary
1969
| Floating-Point Arithmetic.
1970
*----------------------------------------------------------------------------*/
1971

    
1972
float32 float32_sqrt( float32 a STATUS_PARAM )
1973
{
1974
    flag aSign;
1975
    int16 aExp, zExp;
1976
    bits32 aSig, zSig;
1977
    bits64 rem, term;
1978

    
1979
    aSig = extractFloat32Frac( a );
1980
    aExp = extractFloat32Exp( a );
1981
    aSign = extractFloat32Sign( a );
1982
    if ( aExp == 0xFF ) {
1983
        if ( aSig ) return propagateFloat32NaN( a, 0 STATUS_VAR );
1984
        if ( ! aSign ) return a;
1985
        float_raise( float_flag_invalid STATUS_VAR);
1986
        return float32_default_nan;
1987
    }
1988
    if ( aSign ) {
1989
        if ( ( aExp | aSig ) == 0 ) return a;
1990
        float_raise( float_flag_invalid STATUS_VAR);
1991
        return float32_default_nan;
1992
    }
1993
    if ( aExp == 0 ) {
1994
        if ( aSig == 0 ) return 0;
1995
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1996
    }
1997
    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1998
    aSig = ( aSig | 0x00800000 )<<8;
1999
    zSig = estimateSqrt32( aExp, aSig ) + 2;
2000
    if ( ( zSig & 0x7F ) <= 5 ) {
2001
        if ( zSig < 2 ) {
2002
            zSig = 0x7FFFFFFF;
2003
            goto roundAndPack;
2004
        }
2005
        aSig >>= aExp & 1;
2006
        term = ( (bits64) zSig ) * zSig;
2007
        rem = ( ( (bits64) aSig )<<32 ) - term;
2008
        while ( (sbits64) rem < 0 ) {
2009
            --zSig;
2010
            rem += ( ( (bits64) zSig )<<1 ) | 1;
2011
        }
2012
        zSig |= ( rem != 0 );
2013
    }
2014
    shift32RightJamming( zSig, 1, &zSig );
2015
 roundAndPack:
2016
    return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2017

    
2018
}
2019

    
2020
/*----------------------------------------------------------------------------
2021
| Returns 1 if the single-precision floating-point value `a' is equal to
2022
| the corresponding value `b', and 0 otherwise.  The comparison is performed
2023
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2024
*----------------------------------------------------------------------------*/
2025

    
2026
flag float32_eq( float32 a, float32 b STATUS_PARAM )
2027
{
2028

    
2029
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2030
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2031
       ) {
2032
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2033
            float_raise( float_flag_invalid STATUS_VAR);
2034
        }
2035
        return 0;
2036
    }
2037
    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2038

    
2039
}
2040

    
2041
/*----------------------------------------------------------------------------
2042
| Returns 1 if the single-precision floating-point value `a' is less than
2043
| or equal to the corresponding value `b', and 0 otherwise.  The comparison
2044
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2045
| Arithmetic.
2046
*----------------------------------------------------------------------------*/
2047

    
2048
flag float32_le( float32 a, float32 b STATUS_PARAM )
2049
{
2050
    flag aSign, bSign;
2051

    
2052
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2053
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2054
       ) {
2055
        float_raise( float_flag_invalid STATUS_VAR);
2056
        return 0;
2057
    }
2058
    aSign = extractFloat32Sign( a );
2059
    bSign = extractFloat32Sign( b );
2060
    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2061
    return ( a == b ) || ( aSign ^ ( a < b ) );
2062

    
2063
}
2064

    
2065
/*----------------------------------------------------------------------------
2066
| Returns 1 if the single-precision floating-point value `a' is less than
2067
| the corresponding value `b', and 0 otherwise.  The comparison is performed
2068
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2069
*----------------------------------------------------------------------------*/
2070

    
2071
flag float32_lt( float32 a, float32 b STATUS_PARAM )
2072
{
2073
    flag aSign, bSign;
2074

    
2075
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2076
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2077
       ) {
2078
        float_raise( float_flag_invalid STATUS_VAR);
2079
        return 0;
2080
    }
2081
    aSign = extractFloat32Sign( a );
2082
    bSign = extractFloat32Sign( b );
2083
    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2084
    return ( a != b ) && ( aSign ^ ( a < b ) );
2085

    
2086
}
2087

    
2088
/*----------------------------------------------------------------------------
2089
| Returns 1 if the single-precision floating-point value `a' is equal to
2090
| the corresponding value `b', and 0 otherwise.  The invalid exception is
2091
| raised if either operand is a NaN.  Otherwise, the comparison is performed
2092
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2093
*----------------------------------------------------------------------------*/
2094

    
2095
flag float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2096
{
2097

    
2098
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2099
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2100
       ) {
2101
        float_raise( float_flag_invalid STATUS_VAR);
2102
        return 0;
2103
    }
2104
    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2105

    
2106
}
2107

    
2108
/*----------------------------------------------------------------------------
2109
| Returns 1 if the single-precision floating-point value `a' is less than or
2110
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2111
| cause an exception.  Otherwise, the comparison is performed according to the
2112
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2113
*----------------------------------------------------------------------------*/
2114

    
2115
flag float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2116
{
2117
    flag aSign, bSign;
2118

    
2119
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2120
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2121
       ) {
2122
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2123
            float_raise( float_flag_invalid STATUS_VAR);
2124
        }
2125
        return 0;
2126
    }
2127
    aSign = extractFloat32Sign( a );
2128
    bSign = extractFloat32Sign( b );
2129
    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2130
    return ( a == b ) || ( aSign ^ ( a < b ) );
2131

    
2132
}
2133

    
2134
/*----------------------------------------------------------------------------
2135
| Returns 1 if the single-precision floating-point value `a' is less than
2136
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2137
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2138
| Standard for Binary Floating-Point Arithmetic.
2139
*----------------------------------------------------------------------------*/
2140

    
2141
flag float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2142
{
2143
    flag aSign, bSign;
2144

    
2145
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2146
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2147
       ) {
2148
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2149
            float_raise( float_flag_invalid STATUS_VAR);
2150
        }
2151
        return 0;
2152
    }
2153
    aSign = extractFloat32Sign( a );
2154
    bSign = extractFloat32Sign( b );
2155
    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2156
    return ( a != b ) && ( aSign ^ ( a < b ) );
2157

    
2158
}
2159

    
2160
/*----------------------------------------------------------------------------
2161
| Returns the result of converting the double-precision floating-point value
2162
| `a' to the 32-bit two's complement integer format.  The conversion is
2163
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2164
| Arithmetic---which means in particular that the conversion is rounded
2165
| according to the current rounding mode.  If `a' is a NaN, the largest
2166
| positive integer is returned.  Otherwise, if the conversion overflows, the
2167
| largest integer with the same sign as `a' is returned.
2168
*----------------------------------------------------------------------------*/
2169

    
2170
int32 float64_to_int32( float64 a STATUS_PARAM )
2171
{
2172
    flag aSign;
2173
    int16 aExp, shiftCount;
2174
    bits64 aSig;
2175

    
2176
    aSig = extractFloat64Frac( a );
2177
    aExp = extractFloat64Exp( a );
2178
    aSign = extractFloat64Sign( a );
2179
    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2180
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2181
    shiftCount = 0x42C - aExp;
2182
    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2183
    return roundAndPackInt32( aSign, aSig STATUS_VAR );
2184

    
2185
}
2186

    
2187
/*----------------------------------------------------------------------------
2188
| Returns the result of converting the double-precision floating-point value
2189
| `a' to the 32-bit two's complement integer format.  The conversion is
2190
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2191
| Arithmetic, except that the conversion is always rounded toward zero.
2192
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2193
| the conversion overflows, the largest integer with the same sign as `a' is
2194
| returned.
2195
*----------------------------------------------------------------------------*/
2196

    
2197
int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2198
{
2199
    flag aSign;
2200
    int16 aExp, shiftCount;
2201
    bits64 aSig, savedASig;
2202
    int32 z;
2203

    
2204
    aSig = extractFloat64Frac( a );
2205
    aExp = extractFloat64Exp( a );
2206
    aSign = extractFloat64Sign( a );
2207
    if ( 0x41E < aExp ) {
2208
        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2209
        goto invalid;
2210
    }
2211
    else if ( aExp < 0x3FF ) {
2212
        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2213
        return 0;
2214
    }
2215
    aSig |= LIT64( 0x0010000000000000 );
2216
    shiftCount = 0x433 - aExp;
2217
    savedASig = aSig;
2218
    aSig >>= shiftCount;
2219
    z = aSig;
2220
    if ( aSign ) z = - z;
2221
    if ( ( z < 0 ) ^ aSign ) {
2222
 invalid:
2223
        float_raise( float_flag_invalid STATUS_VAR);
2224
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2225
    }
2226
    if ( ( aSig<<shiftCount ) != savedASig ) {
2227
        STATUS(float_exception_flags) |= float_flag_inexact;
2228
    }
2229
    return z;
2230

    
2231
}
2232

    
2233
/*----------------------------------------------------------------------------
2234
| Returns the result of converting the double-precision floating-point value
2235
| `a' to the 64-bit two's complement integer format.  The conversion is
2236
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2237
| Arithmetic---which means in particular that the conversion is rounded
2238
| according to the current rounding mode.  If `a' is a NaN, the largest
2239
| positive integer is returned.  Otherwise, if the conversion overflows, the
2240
| largest integer with the same sign as `a' is returned.
2241
*----------------------------------------------------------------------------*/
2242

    
2243
int64 float64_to_int64( float64 a STATUS_PARAM )
2244
{
2245
    flag aSign;
2246
    int16 aExp, shiftCount;
2247
    bits64 aSig, aSigExtra;
2248

    
2249
    aSig = extractFloat64Frac( a );
2250
    aExp = extractFloat64Exp( a );
2251
    aSign = extractFloat64Sign( a );
2252
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2253
    shiftCount = 0x433 - aExp;
2254
    if ( shiftCount <= 0 ) {
2255
        if ( 0x43E < aExp ) {
2256
            float_raise( float_flag_invalid STATUS_VAR);
2257
            if (    ! aSign
2258
                 || (    ( aExp == 0x7FF )
2259
                      && ( aSig != LIT64( 0x0010000000000000 ) ) )
2260
               ) {
2261
                return LIT64( 0x7FFFFFFFFFFFFFFF );
2262
            }
2263
            return (sbits64) LIT64( 0x8000000000000000 );
2264
        }
2265
        aSigExtra = 0;
2266
        aSig <<= - shiftCount;
2267
    }
2268
    else {
2269
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2270
    }
2271
    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2272

    
2273
}
2274

    
2275
/*----------------------------------------------------------------------------
2276
| Returns the result of converting the double-precision floating-point value
2277
| `a' to the 64-bit two's complement integer format.  The conversion is
2278
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2279
| Arithmetic, except that the conversion is always rounded toward zero.
2280
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2281
| the conversion overflows, the largest integer with the same sign as `a' is
2282
| returned.
2283
*----------------------------------------------------------------------------*/
2284

    
2285
int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2286
{
2287
    flag aSign;
2288
    int16 aExp, shiftCount;
2289
    bits64 aSig;
2290
    int64 z;
2291

    
2292
    aSig = extractFloat64Frac( a );
2293
    aExp = extractFloat64Exp( a );
2294
    aSign = extractFloat64Sign( a );
2295
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2296
    shiftCount = aExp - 0x433;
2297
    if ( 0 <= shiftCount ) {
2298
        if ( 0x43E <= aExp ) {
2299
            if ( a != LIT64( 0xC3E0000000000000 ) ) {
2300
                float_raise( float_flag_invalid STATUS_VAR);
2301
                if (    ! aSign
2302
                     || (    ( aExp == 0x7FF )
2303
                          && ( aSig != LIT64( 0x0010000000000000 ) ) )
2304
                   ) {
2305
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
2306
                }
2307
            }
2308
            return (sbits64) LIT64( 0x8000000000000000 );
2309
        }
2310
        z = aSig<<shiftCount;
2311
    }
2312
    else {
2313
        if ( aExp < 0x3FE ) {
2314
            if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2315
            return 0;
2316
        }
2317
        z = aSig>>( - shiftCount );
2318
        if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2319
            STATUS(float_exception_flags) |= float_flag_inexact;
2320
        }
2321
    }
2322
    if ( aSign ) z = - z;
2323
    return z;
2324

    
2325
}
2326

    
2327
/*----------------------------------------------------------------------------
2328
| Returns the result of converting the double-precision floating-point value
2329
| `a' to the single-precision floating-point format.  The conversion is
2330
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2331
| Arithmetic.
2332
*----------------------------------------------------------------------------*/
2333

    
2334
float32 float64_to_float32( float64 a STATUS_PARAM )
2335
{
2336
    flag aSign;
2337
    int16 aExp;
2338
    bits64 aSig;
2339
    bits32 zSig;
2340

    
2341
    aSig = extractFloat64Frac( a );
2342
    aExp = extractFloat64Exp( a );
2343
    aSign = extractFloat64Sign( a );
2344
    if ( aExp == 0x7FF ) {
2345
        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2346
        return packFloat32( aSign, 0xFF, 0 );
2347
    }
2348
    shift64RightJamming( aSig, 22, &aSig );
2349
    zSig = aSig;
2350
    if ( aExp || zSig ) {
2351
        zSig |= 0x40000000;
2352
        aExp -= 0x381;
2353
    }
2354
    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2355

    
2356
}
2357

    
2358
#ifdef FLOATX80
2359

    
2360
/*----------------------------------------------------------------------------
2361
| Returns the result of converting the double-precision floating-point value
2362
| `a' to the extended double-precision floating-point format.  The conversion
2363
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2364
| Arithmetic.
2365
*----------------------------------------------------------------------------*/
2366

    
2367
floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2368
{
2369
    flag aSign;
2370
    int16 aExp;
2371
    bits64 aSig;
2372

    
2373
    aSig = extractFloat64Frac( a );
2374
    aExp = extractFloat64Exp( a );
2375
    aSign = extractFloat64Sign( a );
2376
    if ( aExp == 0x7FF ) {
2377
        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2378
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2379
    }
2380
    if ( aExp == 0 ) {
2381
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2382
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2383
    }
2384
    return
2385
        packFloatx80(
2386
            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2387

    
2388
}
2389

    
2390
#endif
2391

    
2392
#ifdef FLOAT128
2393

    
2394
/*----------------------------------------------------------------------------
2395
| Returns the result of converting the double-precision floating-point value
2396
| `a' to the quadruple-precision floating-point format.  The conversion is
2397
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2398
| Arithmetic.
2399
*----------------------------------------------------------------------------*/
2400

    
2401
float128 float64_to_float128( float64 a STATUS_PARAM )
2402
{
2403
    flag aSign;
2404
    int16 aExp;
2405
    bits64 aSig, zSig0, zSig1;
2406

    
2407
    aSig = extractFloat64Frac( a );
2408
    aExp = extractFloat64Exp( a );
2409
    aSign = extractFloat64Sign( a );
2410
    if ( aExp == 0x7FF ) {
2411
        if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2412
        return packFloat128( aSign, 0x7FFF, 0, 0 );
2413
    }
2414
    if ( aExp == 0 ) {
2415
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2416
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2417
        --aExp;
2418
    }
2419
    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2420
    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2421

    
2422
}
2423

    
2424
#endif
2425

    
2426
/*----------------------------------------------------------------------------
2427
| Rounds the double-precision floating-point value `a' to an integer, and
2428
| returns the result as a double-precision floating-point value.  The
2429
| operation is performed according to the IEC/IEEE Standard for Binary
2430
| Floating-Point Arithmetic.
2431
*----------------------------------------------------------------------------*/
2432

    
2433
float64 float64_round_to_int( float64 a STATUS_PARAM )
2434
{
2435
    flag aSign;
2436
    int16 aExp;
2437
    bits64 lastBitMask, roundBitsMask;
2438
    int8 roundingMode;
2439
    float64 z;
2440

    
2441
    aExp = extractFloat64Exp( a );
2442
    if ( 0x433 <= aExp ) {
2443
        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2444
            return propagateFloat64NaN( a, a STATUS_VAR );
2445
        }
2446
        return a;
2447
    }
2448
    if ( aExp < 0x3FF ) {
2449
        if ( (bits64) ( a<<1 ) == 0 ) return a;
2450
        STATUS(float_exception_flags) |= float_flag_inexact;
2451
        aSign = extractFloat64Sign( a );
2452
        switch ( STATUS(float_rounding_mode) ) {
2453
         case float_round_nearest_even:
2454
            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2455
                return packFloat64( aSign, 0x3FF, 0 );
2456
            }
2457
            break;
2458
         case float_round_down:
2459
            return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2460
         case float_round_up:
2461
            return
2462
            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2463
        }
2464
        return packFloat64( aSign, 0, 0 );
2465
    }
2466
    lastBitMask = 1;
2467
    lastBitMask <<= 0x433 - aExp;
2468
    roundBitsMask = lastBitMask - 1;
2469
    z = a;
2470
    roundingMode = STATUS(float_rounding_mode);
2471
    if ( roundingMode == float_round_nearest_even ) {
2472
        z += lastBitMask>>1;
2473
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2474
    }
2475
    else if ( roundingMode != float_round_to_zero ) {
2476
        if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2477
            z += roundBitsMask;
2478
        }
2479
    }
2480
    z &= ~ roundBitsMask;
2481
    if ( z != a ) STATUS(float_exception_flags) |= float_flag_inexact;
2482
    return z;
2483

    
2484
}
2485

    
2486
float64 float64_trunc_to_int( float64 a STATUS_PARAM)
2487
{
2488
    int oldmode;
2489
    float64 res;
2490
    oldmode = STATUS(float_rounding_mode);
2491
    STATUS(float_rounding_mode) = float_round_to_zero;
2492
    res = float64_round_to_int(a STATUS_VAR);
2493
    STATUS(float_rounding_mode) = oldmode;
2494
    return res;
2495
}
2496

    
2497
/*----------------------------------------------------------------------------
2498
| Returns the result of adding the absolute values of the double-precision
2499
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2500
| before being returned.  `zSign' is ignored if the result is a NaN.
2501
| The addition is performed according to the IEC/IEEE Standard for Binary
2502
| Floating-Point Arithmetic.
2503
*----------------------------------------------------------------------------*/
2504

    
2505
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2506
{
2507
    int16 aExp, bExp, zExp;
2508
    bits64 aSig, bSig, zSig;
2509
    int16 expDiff;
2510

    
2511
    aSig = extractFloat64Frac( a );
2512
    aExp = extractFloat64Exp( a );
2513
    bSig = extractFloat64Frac( b );
2514
    bExp = extractFloat64Exp( b );
2515
    expDiff = aExp - bExp;
2516
    aSig <<= 9;
2517
    bSig <<= 9;
2518
    if ( 0 < expDiff ) {
2519
        if ( aExp == 0x7FF ) {
2520
            if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2521
            return a;
2522
        }
2523
        if ( bExp == 0 ) {
2524
            --expDiff;
2525
        }
2526
        else {
2527
            bSig |= LIT64( 0x2000000000000000 );
2528
        }
2529
        shift64RightJamming( bSig, expDiff, &bSig );
2530
        zExp = aExp;
2531
    }
2532
    else if ( expDiff < 0 ) {
2533
        if ( bExp == 0x7FF ) {
2534
            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2535
            return packFloat64( zSign, 0x7FF, 0 );
2536
        }
2537
        if ( aExp == 0 ) {
2538
            ++expDiff;
2539
        }
2540
        else {
2541
            aSig |= LIT64( 0x2000000000000000 );
2542
        }
2543
        shift64RightJamming( aSig, - expDiff, &aSig );
2544
        zExp = bExp;
2545
    }
2546
    else {
2547
        if ( aExp == 0x7FF ) {
2548
            if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2549
            return a;
2550
        }
2551
        if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2552
        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2553
        zExp = aExp;
2554
        goto roundAndPack;
2555
    }
2556
    aSig |= LIT64( 0x2000000000000000 );
2557
    zSig = ( aSig + bSig )<<1;
2558
    --zExp;
2559
    if ( (sbits64) zSig < 0 ) {
2560
        zSig = aSig + bSig;
2561
        ++zExp;
2562
    }
2563
 roundAndPack:
2564
    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2565

    
2566
}
2567

    
2568
/*----------------------------------------------------------------------------
2569
| Returns the result of subtracting the absolute values of the double-
2570
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
2571
| difference is negated before being returned.  `zSign' is ignored if the
2572
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
2573
| Standard for Binary Floating-Point Arithmetic.
2574
*----------------------------------------------------------------------------*/
2575

    
2576
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2577
{
2578
    int16 aExp, bExp, zExp;
2579
    bits64 aSig, bSig, zSig;
2580
    int16 expDiff;
2581

    
2582
    aSig = extractFloat64Frac( a );
2583
    aExp = extractFloat64Exp( a );
2584
    bSig = extractFloat64Frac( b );
2585
    bExp = extractFloat64Exp( b );
2586
    expDiff = aExp - bExp;
2587
    aSig <<= 10;
2588
    bSig <<= 10;
2589
    if ( 0 < expDiff ) goto aExpBigger;
2590
    if ( expDiff < 0 ) goto bExpBigger;
2591
    if ( aExp == 0x7FF ) {
2592
        if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2593
        float_raise( float_flag_invalid STATUS_VAR);
2594
        return float64_default_nan;
2595
    }
2596
    if ( aExp == 0 ) {
2597
        aExp = 1;
2598
        bExp = 1;
2599
    }
2600
    if ( bSig < aSig ) goto aBigger;
2601
    if ( aSig < bSig ) goto bBigger;
2602
    return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2603
 bExpBigger:
2604
    if ( bExp == 0x7FF ) {
2605
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2606
        return packFloat64( zSign ^ 1, 0x7FF, 0 );
2607
    }
2608
    if ( aExp == 0 ) {
2609
        ++expDiff;
2610
    }
2611
    else {
2612
        aSig |= LIT64( 0x4000000000000000 );
2613
    }
2614
    shift64RightJamming( aSig, - expDiff, &aSig );
2615
    bSig |= LIT64( 0x4000000000000000 );
2616
 bBigger:
2617
    zSig = bSig - aSig;
2618
    zExp = bExp;
2619
    zSign ^= 1;
2620
    goto normalizeRoundAndPack;
2621
 aExpBigger:
2622
    if ( aExp == 0x7FF ) {
2623
        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2624
        return a;
2625
    }
2626
    if ( bExp == 0 ) {
2627
        --expDiff;
2628
    }
2629
    else {
2630
        bSig |= LIT64( 0x4000000000000000 );
2631
    }
2632
    shift64RightJamming( bSig, expDiff, &bSig );
2633
    aSig |= LIT64( 0x4000000000000000 );
2634
 aBigger:
2635
    zSig = aSig - bSig;
2636
    zExp = aExp;
2637
 normalizeRoundAndPack:
2638
    --zExp;
2639
    return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2640

    
2641
}
2642

    
2643
/*----------------------------------------------------------------------------
2644
| Returns the result of adding the double-precision floating-point values `a'
2645
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
2646
| Binary Floating-Point Arithmetic.
2647
*----------------------------------------------------------------------------*/
2648

    
2649
float64 float64_add( float64 a, float64 b STATUS_PARAM )
2650
{
2651
    flag aSign, bSign;
2652

    
2653
    aSign = extractFloat64Sign( a );
2654
    bSign = extractFloat64Sign( b );
2655
    if ( aSign == bSign ) {
2656
        return addFloat64Sigs( a, b, aSign STATUS_VAR );
2657
    }
2658
    else {
2659
        return subFloat64Sigs( a, b, aSign STATUS_VAR );
2660
    }
2661

    
2662
}
2663

    
2664
/*----------------------------------------------------------------------------
2665
| Returns the result of subtracting the double-precision floating-point values
2666
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2667
| for Binary Floating-Point Arithmetic.
2668
*----------------------------------------------------------------------------*/
2669

    
2670
float64 float64_sub( float64 a, float64 b STATUS_PARAM )
2671
{
2672
    flag aSign, bSign;
2673

    
2674
    aSign = extractFloat64Sign( a );
2675
    bSign = extractFloat64Sign( b );
2676
    if ( aSign == bSign ) {
2677
        return subFloat64Sigs( a, b, aSign STATUS_VAR );
2678
    }
2679
    else {
2680
        return addFloat64Sigs( a, b, aSign STATUS_VAR );
2681
    }
2682

    
2683
}
2684

    
2685
/*----------------------------------------------------------------------------
2686
| Returns the result of multiplying the double-precision floating-point values
2687
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2688
| for Binary Floating-Point Arithmetic.
2689
*----------------------------------------------------------------------------*/
2690

    
2691
float64 float64_mul( float64 a, float64 b STATUS_PARAM )
2692
{
2693
    flag aSign, bSign, zSign;
2694
    int16 aExp, bExp, zExp;
2695
    bits64 aSig, bSig, zSig0, zSig1;
2696

    
2697
    aSig = extractFloat64Frac( a );
2698
    aExp = extractFloat64Exp( a );
2699
    aSign = extractFloat64Sign( a );
2700
    bSig = extractFloat64Frac( b );
2701
    bExp = extractFloat64Exp( b );
2702
    bSign = extractFloat64Sign( b );
2703
    zSign = aSign ^ bSign;
2704
    if ( aExp == 0x7FF ) {
2705
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2706
            return propagateFloat64NaN( a, b STATUS_VAR );
2707
        }
2708
        if ( ( bExp | bSig ) == 0 ) {
2709
            float_raise( float_flag_invalid STATUS_VAR);
2710
            return float64_default_nan;
2711
        }
2712
        return packFloat64( zSign, 0x7FF, 0 );
2713
    }
2714
    if ( bExp == 0x7FF ) {
2715
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2716
        if ( ( aExp | aSig ) == 0 ) {
2717
            float_raise( float_flag_invalid STATUS_VAR);
2718
            return float64_default_nan;
2719
        }
2720
        return packFloat64( zSign, 0x7FF, 0 );
2721
    }
2722
    if ( aExp == 0 ) {
2723
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2724
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2725
    }
2726
    if ( bExp == 0 ) {
2727
        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2728
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2729
    }
2730
    zExp = aExp + bExp - 0x3FF;
2731
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2732
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2733
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
2734
    zSig0 |= ( zSig1 != 0 );
2735
    if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2736
        zSig0 <<= 1;
2737
        --zExp;
2738
    }
2739
    return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
2740

    
2741
}
2742

    
2743
/*----------------------------------------------------------------------------
2744
| Returns the result of dividing the double-precision floating-point value `a'
2745
| by the corresponding value `b'.  The operation is performed according to
2746
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2747
*----------------------------------------------------------------------------*/
2748

    
2749
float64 float64_div( float64 a, float64 b STATUS_PARAM )
2750
{
2751
    flag aSign, bSign, zSign;
2752
    int16 aExp, bExp, zExp;
2753
    bits64 aSig, bSig, zSig;
2754
    bits64 rem0, rem1;
2755
    bits64 term0, term1;
2756

    
2757
    aSig = extractFloat64Frac( a );
2758
    aExp = extractFloat64Exp( a );
2759
    aSign = extractFloat64Sign( a );
2760
    bSig = extractFloat64Frac( b );
2761
    bExp = extractFloat64Exp( b );
2762
    bSign = extractFloat64Sign( b );
2763
    zSign = aSign ^ bSign;
2764
    if ( aExp == 0x7FF ) {
2765
        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2766
        if ( bExp == 0x7FF ) {
2767
            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2768
            float_raise( float_flag_invalid STATUS_VAR);
2769
            return float64_default_nan;
2770
        }
2771
        return packFloat64( zSign, 0x7FF, 0 );
2772
    }
2773
    if ( bExp == 0x7FF ) {
2774
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2775
        return packFloat64( zSign, 0, 0 );
2776
    }
2777
    if ( bExp == 0 ) {
2778
        if ( bSig == 0 ) {
2779
            if ( ( aExp | aSig ) == 0 ) {
2780
                float_raise( float_flag_invalid STATUS_VAR);
2781
                return float64_default_nan;
2782
            }
2783
            float_raise( float_flag_divbyzero STATUS_VAR);
2784
            return packFloat64( zSign, 0x7FF, 0 );
2785
        }
2786
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2787
    }
2788
    if ( aExp == 0 ) {
2789
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2790
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2791
    }
2792
    zExp = aExp - bExp + 0x3FD;
2793
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2794
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2795
    if ( bSig <= ( aSig + aSig ) ) {
2796
        aSig >>= 1;
2797
        ++zExp;
2798
    }
2799
    zSig = estimateDiv128To64( aSig, 0, bSig );
2800
    if ( ( zSig & 0x1FF ) <= 2 ) {
2801
        mul64To128( bSig, zSig, &term0, &term1 );
2802
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2803
        while ( (sbits64) rem0 < 0 ) {
2804
            --zSig;
2805
            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2806
        }
2807
        zSig |= ( rem1 != 0 );
2808
    }
2809
    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2810

    
2811
}
2812

    
2813
/*----------------------------------------------------------------------------
2814
| Returns the remainder of the double-precision floating-point value `a'
2815
| with respect to the corresponding value `b'.  The operation is performed
2816
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2817
*----------------------------------------------------------------------------*/
2818

    
2819
float64 float64_rem( float64 a, float64 b STATUS_PARAM )
2820
{
2821
    flag aSign, bSign, zSign;
2822
    int16 aExp, bExp, expDiff;
2823
    bits64 aSig, bSig;
2824
    bits64 q, alternateASig;
2825
    sbits64 sigMean;
2826

    
2827
    aSig = extractFloat64Frac( a );
2828
    aExp = extractFloat64Exp( a );
2829
    aSign = extractFloat64Sign( a );
2830
    bSig = extractFloat64Frac( b );
2831
    bExp = extractFloat64Exp( b );
2832
    bSign = extractFloat64Sign( b );
2833
    if ( aExp == 0x7FF ) {
2834
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2835
            return propagateFloat64NaN( a, b STATUS_VAR );
2836
        }
2837
        float_raise( float_flag_invalid STATUS_VAR);
2838
        return float64_default_nan;
2839
    }
2840
    if ( bExp == 0x7FF ) {
2841
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2842
        return a;
2843
    }
2844
    if ( bExp == 0 ) {
2845
        if ( bSig == 0 ) {
2846
            float_raise( float_flag_invalid STATUS_VAR);
2847
            return float64_default_nan;
2848
        }
2849
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2850
    }
2851
    if ( aExp == 0 ) {
2852
        if ( aSig == 0 ) return a;
2853
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2854
    }
2855
    expDiff = aExp - bExp;
2856
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2857
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2858
    if ( expDiff < 0 ) {
2859
        if ( expDiff < -1 ) return a;
2860
        aSig >>= 1;
2861
    }
2862
    q = ( bSig <= aSig );
2863
    if ( q ) aSig -= bSig;
2864
    expDiff -= 64;
2865
    while ( 0 < expDiff ) {
2866
        q = estimateDiv128To64( aSig, 0, bSig );
2867
        q = ( 2 < q ) ? q - 2 : 0;
2868
        aSig = - ( ( bSig>>2 ) * q );
2869
        expDiff -= 62;
2870
    }
2871
    expDiff += 64;
2872
    if ( 0 < expDiff ) {
2873
        q = estimateDiv128To64( aSig, 0, bSig );
2874
        q = ( 2 < q ) ? q - 2 : 0;
2875
        q >>= 64 - expDiff;
2876
        bSig >>= 2;
2877
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2878
    }
2879
    else {
2880
        aSig >>= 2;
2881
        bSig >>= 2;
2882
    }
2883
    do {
2884
        alternateASig = aSig;
2885
        ++q;
2886
        aSig -= bSig;
2887
    } while ( 0 <= (sbits64) aSig );
2888
    sigMean = aSig + alternateASig;
2889
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2890
        aSig = alternateASig;
2891
    }
2892
    zSign = ( (sbits64) aSig < 0 );
2893
    if ( zSign ) aSig = - aSig;
2894
    return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
2895

    
2896
}
2897

    
2898
/*----------------------------------------------------------------------------
2899
| Returns the square root of the double-precision floating-point value `a'.
2900
| The operation is performed according to the IEC/IEEE Standard for Binary
2901
| Floating-Point Arithmetic.
2902
*----------------------------------------------------------------------------*/
2903

    
2904
float64 float64_sqrt( float64 a STATUS_PARAM )
2905
{
2906
    flag aSign;
2907
    int16 aExp, zExp;
2908
    bits64 aSig, zSig, doubleZSig;
2909
    bits64 rem0, rem1, term0, term1;
2910

    
2911
    aSig = extractFloat64Frac( a );
2912
    aExp = extractFloat64Exp( a );
2913
    aSign = extractFloat64Sign( a );
2914
    if ( aExp == 0x7FF ) {
2915
        if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
2916
        if ( ! aSign ) return a;
2917
        float_raise( float_flag_invalid STATUS_VAR);
2918
        return float64_default_nan;
2919
    }
2920
    if ( aSign ) {
2921
        if ( ( aExp | aSig ) == 0 ) return a;
2922
        float_raise( float_flag_invalid STATUS_VAR);
2923
        return float64_default_nan;
2924
    }
2925
    if ( aExp == 0 ) {
2926
        if ( aSig == 0 ) return 0;
2927
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2928
    }
2929
    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2930
    aSig |= LIT64( 0x0010000000000000 );
2931
    zSig = estimateSqrt32( aExp, aSig>>21 );
2932
    aSig <<= 9 - ( aExp & 1 );
2933
    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
2934
    if ( ( zSig & 0x1FF ) <= 5 ) {
2935
        doubleZSig = zSig<<1;
2936
        mul64To128( zSig, zSig, &term0, &term1 );
2937
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2938
        while ( (sbits64) rem0 < 0 ) {
2939
            --zSig;
2940
            doubleZSig -= 2;
2941
            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
2942
        }
2943
        zSig |= ( ( rem0 | rem1 ) != 0 );
2944
    }
2945
    return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
2946

    
2947
}
2948

    
2949
/*----------------------------------------------------------------------------
2950
| Returns 1 if the double-precision floating-point value `a' is equal to the
2951
| corresponding value `b', and 0 otherwise.  The comparison is performed
2952
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2953
*----------------------------------------------------------------------------*/
2954

    
2955
flag float64_eq( float64 a, float64 b STATUS_PARAM )
2956
{
2957

    
2958
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2959
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2960
       ) {
2961
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2962
            float_raise( float_flag_invalid STATUS_VAR);
2963
        }
2964
        return 0;
2965
    }
2966
    return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2967

    
2968
}
2969

    
2970
/*----------------------------------------------------------------------------
2971
| Returns 1 if the double-precision floating-point value `a' is less than or
2972
| equal to the corresponding value `b', and 0 otherwise.  The comparison is
2973
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2974
| Arithmetic.
2975
*----------------------------------------------------------------------------*/
2976

    
2977
flag float64_le( float64 a, float64 b STATUS_PARAM )
2978
{
2979
    flag aSign, bSign;
2980

    
2981
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2982
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2983
       ) {
2984
        float_raise( float_flag_invalid STATUS_VAR);
2985
        return 0;
2986
    }
2987
    aSign = extractFloat64Sign( a );
2988
    bSign = extractFloat64Sign( b );
2989
    if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2990
    return ( a == b ) || ( aSign ^ ( a < b ) );
2991

    
2992
}
2993

    
2994
/*----------------------------------------------------------------------------
2995
| Returns 1 if the double-precision floating-point value `a' is less than
2996
| the corresponding value `b', and 0 otherwise.  The comparison is performed
2997
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2998
*----------------------------------------------------------------------------*/
2999

    
3000
flag float64_lt( float64 a, float64 b STATUS_PARAM )
3001
{
3002
    flag aSign, bSign;
3003

    
3004
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3005
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3006
       ) {
3007
        float_raise( float_flag_invalid STATUS_VAR);
3008
        return 0;
3009
    }
3010
    aSign = extractFloat64Sign( a );
3011
    bSign = extractFloat64Sign( b );
3012
    if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3013
    return ( a != b ) && ( aSign ^ ( a < b ) );
3014

    
3015
}
3016

    
3017
/*----------------------------------------------------------------------------
3018
| Returns 1 if the double-precision floating-point value `a' is equal to the
3019
| corresponding value `b', and 0 otherwise.  The invalid exception is raised
3020
| if either operand is a NaN.  Otherwise, the comparison is performed
3021
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3022
*----------------------------------------------------------------------------*/
3023

    
3024
flag float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3025
{
3026

    
3027
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3028
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3029
       ) {
3030
        float_raise( float_flag_invalid STATUS_VAR);
3031
        return 0;
3032
    }
3033
    return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3034

    
3035
}
3036

    
3037
/*----------------------------------------------------------------------------
3038
| Returns 1 if the double-precision floating-point value `a' is less than or
3039
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3040
| cause an exception.  Otherwise, the comparison is performed according to the
3041
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3042
*----------------------------------------------------------------------------*/
3043

    
3044
flag float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3045
{
3046
    flag aSign, bSign;
3047

    
3048
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3049
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3050
       ) {
3051
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3052
            float_raise( float_flag_invalid STATUS_VAR);
3053
        }
3054
        return 0;
3055
    }
3056
    aSign = extractFloat64Sign( a );
3057
    bSign = extractFloat64Sign( b );
3058
    if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3059
    return ( a == b ) || ( aSign ^ ( a < b ) );
3060

    
3061
}
3062

    
3063
/*----------------------------------------------------------------------------
3064
| Returns 1 if the double-precision floating-point value `a' is less than
3065
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3066
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3067
| Standard for Binary Floating-Point Arithmetic.
3068
*----------------------------------------------------------------------------*/
3069

    
3070
flag float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3071
{
3072
    flag aSign, bSign;
3073

    
3074
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3075
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3076
       ) {
3077
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3078
            float_raise( float_flag_invalid STATUS_VAR);
3079
        }
3080
        return 0;
3081
    }
3082
    aSign = extractFloat64Sign( a );
3083
    bSign = extractFloat64Sign( b );
3084
    if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3085
    return ( a != b ) && ( aSign ^ ( a < b ) );
3086

    
3087
}
3088

    
3089
#ifdef FLOATX80
3090

    
3091
/*----------------------------------------------------------------------------
3092
| Returns the result of converting the extended double-precision floating-
3093
| point value `a' to the 32-bit two's complement integer format.  The
3094
| conversion is performed according to the IEC/IEEE Standard for Binary
3095
| Floating-Point Arithmetic---which means in particular that the conversion
3096
| is rounded according to the current rounding mode.  If `a' is a NaN, the
3097
| largest positive integer is returned.  Otherwise, if the conversion
3098
| overflows, the largest integer with the same sign as `a' is returned.
3099
*----------------------------------------------------------------------------*/
3100

    
3101
int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3102
{
3103
    flag aSign;
3104
    int32 aExp, shiftCount;
3105
    bits64 aSig;
3106

    
3107
    aSig = extractFloatx80Frac( a );
3108
    aExp = extractFloatx80Exp( a );
3109
    aSign = extractFloatx80Sign( a );
3110
    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3111
    shiftCount = 0x4037 - aExp;
3112
    if ( shiftCount <= 0 ) shiftCount = 1;
3113
    shift64RightJamming( aSig, shiftCount, &aSig );
3114
    return roundAndPackInt32( aSign, aSig STATUS_VAR );
3115

    
3116
}
3117

    
3118
/*----------------------------------------------------------------------------
3119
| Returns the result of converting the extended double-precision floating-
3120
| point value `a' to the 32-bit two's complement integer format.  The
3121
| conversion is performed according to the IEC/IEEE Standard for Binary
3122
| Floating-Point Arithmetic, except that the conversion is always rounded
3123
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3124
| Otherwise, if the conversion overflows, the largest integer with the same
3125
| sign as `a' is returned.
3126
*----------------------------------------------------------------------------*/
3127

    
3128
int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3129
{
3130
    flag aSign;
3131
    int32 aExp, shiftCount;
3132
    bits64 aSig, savedASig;
3133
    int32 z;
3134

    
3135
    aSig = extractFloatx80Frac( a );
3136
    aExp = extractFloatx80Exp( a );
3137
    aSign = extractFloatx80Sign( a );
3138
    if ( 0x401E < aExp ) {
3139
        if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3140
        goto invalid;
3141
    }
3142
    else if ( aExp < 0x3FFF ) {
3143
        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3144
        return 0;
3145
    }
3146
    shiftCount = 0x403E - aExp;
3147
    savedASig = aSig;
3148
    aSig >>= shiftCount;
3149
    z = aSig;
3150
    if ( aSign ) z = - z;
3151
    if ( ( z < 0 ) ^ aSign ) {
3152
 invalid:
3153
        float_raise( float_flag_invalid STATUS_VAR);
3154
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3155
    }
3156
    if ( ( aSig<<shiftCount ) != savedASig ) {
3157
        STATUS(float_exception_flags) |= float_flag_inexact;
3158
    }
3159
    return z;
3160

    
3161
}
3162

    
3163
/*----------------------------------------------------------------------------
3164
| Returns the result of converting the extended double-precision floating-
3165
| point value `a' to the 64-bit two's complement integer format.  The
3166
| conversion is performed according to the IEC/IEEE Standard for Binary
3167
| Floating-Point Arithmetic---which means in particular that the conversion
3168
| is rounded according to the current rounding mode.  If `a' is a NaN,
3169
| the largest positive integer is returned.  Otherwise, if the conversion
3170
| overflows, the largest integer with the same sign as `a' is returned.
3171
*----------------------------------------------------------------------------*/
3172

    
3173
int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3174
{
3175
    flag aSign;
3176
    int32 aExp, shiftCount;
3177
    bits64 aSig, aSigExtra;
3178

    
3179
    aSig = extractFloatx80Frac( a );
3180
    aExp = extractFloatx80Exp( a );
3181
    aSign = extractFloatx80Sign( a );
3182
    shiftCount = 0x403E - aExp;
3183
    if ( shiftCount <= 0 ) {
3184
        if ( shiftCount ) {
3185
            float_raise( float_flag_invalid STATUS_VAR);
3186
            if (    ! aSign
3187
                 || (    ( aExp == 0x7FFF )
3188
                      && ( aSig != LIT64( 0x8000000000000000 ) ) )
3189
               ) {
3190
                return LIT64( 0x7FFFFFFFFFFFFFFF );
3191
            }
3192
            return (sbits64) LIT64( 0x8000000000000000 );
3193
        }
3194
        aSigExtra = 0;
3195
    }
3196
    else {
3197
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3198
    }
3199
    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3200

    
3201
}
3202

    
3203
/*----------------------------------------------------------------------------
3204
| Returns the result of converting the extended double-precision floating-
3205
| point value `a' to the 64-bit two's complement integer format.  The
3206
| conversion is performed according to the IEC/IEEE Standard for Binary
3207
| Floating-Point Arithmetic, except that the conversion is always rounded
3208
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3209
| Otherwise, if the conversion overflows, the largest integer with the same
3210
| sign as `a' is returned.
3211
*----------------------------------------------------------------------------*/
3212

    
3213
int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3214
{
3215
    flag aSign;
3216
    int32 aExp, shiftCount;
3217
    bits64 aSig;
3218
    int64 z;
3219

    
3220
    aSig = extractFloatx80Frac( a );
3221
    aExp = extractFloatx80Exp( a );
3222
    aSign = extractFloatx80Sign( a );
3223
    shiftCount = aExp - 0x403E;
3224
    if ( 0 <= shiftCount ) {
3225
        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3226
        if ( ( a.high != 0xC03E ) || aSig ) {
3227
            float_raise( float_flag_invalid STATUS_VAR);
3228
            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3229
                return LIT64( 0x7FFFFFFFFFFFFFFF );
3230
            }
3231
        }
3232
        return (sbits64) LIT64( 0x8000000000000000 );
3233
    }
3234
    else if ( aExp < 0x3FFF ) {
3235
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3236
        return 0;
3237
    }
3238
    z = aSig>>( - shiftCount );
3239
    if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3240
        STATUS(float_exception_flags) |= float_flag_inexact;
3241
    }
3242
    if ( aSign ) z = - z;
3243
    return z;
3244

    
3245
}
3246

    
3247
/*----------------------------------------------------------------------------
3248
| Returns the result of converting the extended double-precision floating-
3249
| point value `a' to the single-precision floating-point format.  The
3250
| conversion is performed according to the IEC/IEEE Standard for Binary
3251
| Floating-Point Arithmetic.
3252
*----------------------------------------------------------------------------*/
3253

    
3254
float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3255
{
3256
    flag aSign;
3257
    int32 aExp;
3258
    bits64 aSig;
3259

    
3260
    aSig = extractFloatx80Frac( a );
3261
    aExp = extractFloatx80Exp( a );
3262
    aSign = extractFloatx80Sign( a );
3263
    if ( aExp == 0x7FFF ) {
3264
        if ( (bits64) ( aSig<<1 ) ) {
3265
            return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3266
        }
3267
        return packFloat32( aSign, 0xFF, 0 );
3268
    }
3269
    shift64RightJamming( aSig, 33, &aSig );
3270
    if ( aExp || aSig ) aExp -= 0x3F81;
3271
    return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3272

    
3273
}
3274

    
3275
/*----------------------------------------------------------------------------
3276
| Returns the result of converting the extended double-precision floating-
3277
| point value `a' to the double-precision floating-point format.  The
3278
| conversion is performed according to the IEC/IEEE Standard for Binary
3279
| Floating-Point Arithmetic.
3280
*----------------------------------------------------------------------------*/
3281

    
3282
float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3283
{
3284
    flag aSign;
3285
    int32 aExp;
3286
    bits64 aSig, zSig;
3287

    
3288
    aSig = extractFloatx80Frac( a );
3289
    aExp = extractFloatx80Exp( a );
3290
    aSign = extractFloatx80Sign( a );
3291
    if ( aExp == 0x7FFF ) {
3292
        if ( (bits64) ( aSig<<1 ) ) {
3293
            return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3294
        }
3295
        return packFloat64( aSign, 0x7FF, 0 );
3296
    }
3297
    shift64RightJamming( aSig, 1, &zSig );
3298
    if ( aExp || aSig ) aExp -= 0x3C01;
3299
    return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3300

    
3301
}
3302

    
3303
#ifdef FLOAT128
3304

    
3305
/*----------------------------------------------------------------------------
3306
| Returns the result of converting the extended double-precision floating-
3307
| point value `a' to the quadruple-precision floating-point format.  The
3308
| conversion is performed according to the IEC/IEEE Standard for Binary
3309
| Floating-Point Arithmetic.
3310
*----------------------------------------------------------------------------*/
3311

    
3312
float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3313
{
3314
    flag aSign;
3315
    int16 aExp;
3316
    bits64 aSig, zSig0, zSig1;
3317

    
3318
    aSig = extractFloatx80Frac( a );
3319
    aExp = extractFloatx80Exp( a );
3320
    aSign = extractFloatx80Sign( a );
3321
    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3322
        return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3323
    }
3324
    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3325
    return packFloat128( aSign, aExp, zSig0, zSig1 );
3326

    
3327
}
3328

    
3329
#endif
3330

    
3331
/*----------------------------------------------------------------------------
3332
| Rounds the extended double-precision floating-point value `a' to an integer,
3333
| and returns the result as an extended quadruple-precision floating-point
3334
| value.  The operation is performed according to the IEC/IEEE Standard for
3335
| Binary Floating-Point Arithmetic.
3336
*----------------------------------------------------------------------------*/
3337

    
3338
floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3339
{
3340
    flag aSign;
3341
    int32 aExp;
3342
    bits64 lastBitMask, roundBitsMask;
3343
    int8 roundingMode;
3344
    floatx80 z;
3345

    
3346
    aExp = extractFloatx80Exp( a );
3347
    if ( 0x403E <= aExp ) {
3348
        if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3349
            return propagateFloatx80NaN( a, a STATUS_VAR );
3350
        }
3351
        return a;
3352
    }
3353
    if ( aExp < 0x3FFF ) {
3354
        if (    ( aExp == 0 )
3355
             && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3356
            return a;
3357
        }
3358
        STATUS(float_exception_flags) |= float_flag_inexact;
3359
        aSign = extractFloatx80Sign( a );
3360
        switch ( STATUS(float_rounding_mode) ) {
3361
         case float_round_nearest_even:
3362
            if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3363
               ) {
3364
                return
3365
                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3366
            }
3367
            break;
3368
         case float_round_down:
3369
            return
3370
                  aSign ?
3371
                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3372
                : packFloatx80( 0, 0, 0 );
3373
         case float_round_up:
3374
            return
3375
                  aSign ? packFloatx80( 1, 0, 0 )
3376
                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3377
        }
3378
        return packFloatx80( aSign, 0, 0 );
3379
    }
3380
    lastBitMask = 1;
3381
    lastBitMask <<= 0x403E - aExp;
3382
    roundBitsMask = lastBitMask - 1;
3383
    z = a;
3384
    roundingMode = STATUS(float_rounding_mode);
3385
    if ( roundingMode == float_round_nearest_even ) {
3386
        z.low += lastBitMask>>1;
3387
        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3388
    }
3389
    else if ( roundingMode != float_round_to_zero ) {
3390
        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3391
            z.low += roundBitsMask;
3392
        }
3393
    }
3394
    z.low &= ~ roundBitsMask;
3395
    if ( z.low == 0 ) {
3396
        ++z.high;
3397
        z.low = LIT64( 0x8000000000000000 );
3398
    }
3399
    if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
3400
    return z;
3401

    
3402
}
3403

    
3404
/*----------------------------------------------------------------------------
3405
| Returns the result of adding the absolute values of the extended double-
3406
| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3407
| negated before being returned.  `zSign' is ignored if the result is a NaN.
3408
| The addition is performed according to the IEC/IEEE Standard for Binary
3409
| Floating-Point Arithmetic.
3410
*----------------------------------------------------------------------------*/
3411

    
3412
static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3413
{
3414
    int32 aExp, bExp, zExp;
3415
    bits64 aSig, bSig, zSig0, zSig1;
3416
    int32 expDiff;
3417

    
3418
    aSig = extractFloatx80Frac( a );
3419
    aExp = extractFloatx80Exp( a );
3420
    bSig = extractFloatx80Frac( b );
3421
    bExp = extractFloatx80Exp( b );
3422
    expDiff = aExp - bExp;
3423
    if ( 0 < expDiff ) {
3424
        if ( aExp == 0x7FFF ) {
3425
            if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3426
            return a;
3427
        }
3428
        if ( bExp == 0 ) --expDiff;
3429
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3430
        zExp = aExp;
3431
    }
3432
    else if ( expDiff < 0 ) {
3433
        if ( bExp == 0x7FFF ) {
3434
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3435
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3436
        }
3437
        if ( aExp == 0 ) ++expDiff;
3438
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3439
        zExp = bExp;
3440
    }
3441
    else {
3442
        if ( aExp == 0x7FFF ) {
3443
            if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3444
                return propagateFloatx80NaN( a, b STATUS_VAR );
3445
            }
3446
            return a;
3447
        }
3448
        zSig1 = 0;
3449
        zSig0 = aSig + bSig;
3450
        if ( aExp == 0 ) {
3451
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3452
            goto roundAndPack;
3453
        }
3454
        zExp = aExp;
3455
        goto shiftRight1;
3456
    }
3457
    zSig0 = aSig + bSig;
3458
    if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3459
 shiftRight1:
3460
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3461
    zSig0 |= LIT64( 0x8000000000000000 );
3462
    ++zExp;
3463
 roundAndPack:
3464
    return
3465
        roundAndPackFloatx80(
3466
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3467

    
3468
}
3469

    
3470
/*----------------------------------------------------------------------------
3471
| Returns the result of subtracting the absolute values of the extended
3472
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3473
| difference is negated before being returned.  `zSign' is ignored if the
3474
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
3475
| Standard for Binary Floating-Point Arithmetic.
3476
*----------------------------------------------------------------------------*/
3477

    
3478
static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
3479
{
3480
    int32 aExp, bExp, zExp;
3481
    bits64 aSig, bSig, zSig0, zSig1;
3482
    int32 expDiff;
3483
    floatx80 z;
3484

    
3485
    aSig = extractFloatx80Frac( a );
3486
    aExp = extractFloatx80Exp( a );
3487
    bSig = extractFloatx80Frac( b );
3488
    bExp = extractFloatx80Exp( b );
3489
    expDiff = aExp - bExp;
3490
    if ( 0 < expDiff ) goto aExpBigger;
3491
    if ( expDiff < 0 ) goto bExpBigger;
3492
    if ( aExp == 0x7FFF ) {
3493
        if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3494
            return propagateFloatx80NaN( a, b STATUS_VAR );
3495
        }
3496
        float_raise( float_flag_invalid STATUS_VAR);
3497
        z.low = floatx80_default_nan_low;
3498
        z.high = floatx80_default_nan_high;
3499
        return z;
3500
    }
3501
    if ( aExp == 0 ) {
3502
        aExp = 1;
3503
        bExp = 1;
3504
    }
3505
    zSig1 = 0;
3506
    if ( bSig < aSig ) goto aBigger;
3507
    if ( aSig < bSig ) goto bBigger;
3508
    return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3509
 bExpBigger:
3510
    if ( bExp == 0x7FFF ) {
3511
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3512
        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3513
    }
3514
    if ( aExp == 0 ) ++expDiff;
3515
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3516
 bBigger:
3517
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3518
    zExp = bExp;
3519
    zSign ^= 1;
3520
    goto normalizeRoundAndPack;
3521
 aExpBigger:
3522
    if ( aExp == 0x7FFF ) {
3523
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3524
        return a;
3525
    }
3526
    if ( bExp == 0 ) --expDiff;
3527
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3528
 aBigger:
3529
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3530
    zExp = aExp;
3531
 normalizeRoundAndPack:
3532
    return
3533
        normalizeRoundAndPackFloatx80(
3534
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3535

    
3536
}
3537

    
3538
/*----------------------------------------------------------------------------
3539
| Returns the result of adding the extended double-precision floating-point
3540
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
3541
| Standard for Binary Floating-Point Arithmetic.
3542
*----------------------------------------------------------------------------*/
3543

    
3544
floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
3545
{
3546
    flag aSign, bSign;
3547

    
3548
    aSign = extractFloatx80Sign( a );
3549
    bSign = extractFloatx80Sign( b );
3550
    if ( aSign == bSign ) {
3551
        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3552
    }
3553
    else {
3554
        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3555
    }
3556

    
3557
}
3558

    
3559
/*----------------------------------------------------------------------------
3560
| Returns the result of subtracting the extended double-precision floating-
3561
| point values `a' and `b'.  The operation is performed according to the
3562
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3563
*----------------------------------------------------------------------------*/
3564

    
3565
floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
3566
{
3567
    flag aSign, bSign;
3568

    
3569
    aSign = extractFloatx80Sign( a );
3570
    bSign = extractFloatx80Sign( b );
3571
    if ( aSign == bSign ) {
3572
        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3573
    }
3574
    else {
3575
        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3576
    }
3577

    
3578
}
3579

    
3580
/*----------------------------------------------------------------------------
3581
| Returns the result of multiplying the extended double-precision floating-
3582
| point values `a' and `b'.  The operation is performed according to the
3583
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3584
*----------------------------------------------------------------------------*/
3585

    
3586
floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
3587
{
3588
    flag aSign, bSign, zSign;
3589
    int32 aExp, bExp, zExp;
3590
    bits64 aSig, bSig, zSig0, zSig1;
3591
    floatx80 z;
3592

    
3593
    aSig = extractFloatx80Frac( a );
3594
    aExp = extractFloatx80Exp( a );
3595
    aSign = extractFloatx80Sign( a );
3596
    bSig = extractFloatx80Frac( b );
3597
    bExp = extractFloatx80Exp( b );
3598
    bSign = extractFloatx80Sign( b );
3599
    zSign = aSign ^ bSign;
3600
    if ( aExp == 0x7FFF ) {
3601
        if (    (bits64) ( aSig<<1 )
3602
             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3603
            return propagateFloatx80NaN( a, b STATUS_VAR );
3604
        }
3605
        if ( ( bExp | bSig ) == 0 ) goto invalid;
3606
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3607
    }
3608
    if ( bExp == 0x7FFF ) {
3609
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3610
        if ( ( aExp | aSig ) == 0 ) {
3611
 invalid:
3612
            float_raise( float_flag_invalid STATUS_VAR);
3613
            z.low = floatx80_default_nan_low;
3614
            z.high = floatx80_default_nan_high;
3615
            return z;
3616
        }
3617
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3618
    }
3619
    if ( aExp == 0 ) {
3620
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3621
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3622
    }
3623
    if ( bExp == 0 ) {
3624
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3625
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3626
    }
3627
    zExp = aExp + bExp - 0x3FFE;
3628
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
3629
    if ( 0 < (sbits64) zSig0 ) {
3630
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3631
        --zExp;
3632
    }
3633
    return
3634
        roundAndPackFloatx80(
3635
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3636

    
3637
}
3638

    
3639
/*----------------------------------------------------------------------------
3640
| Returns the result of dividing the extended double-precision floating-point
3641
| value `a' by the corresponding value `b'.  The operation is performed
3642
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3643
*----------------------------------------------------------------------------*/
3644

    
3645
floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
3646
{
3647
    flag aSign, bSign, zSign;
3648
    int32 aExp, bExp, zExp;
3649
    bits64 aSig, bSig, zSig0, zSig1;
3650
    bits64 rem0, rem1, rem2, term0, term1, term2;
3651
    floatx80 z;
3652

    
3653
    aSig = extractFloatx80Frac( a );
3654
    aExp = extractFloatx80Exp( a );
3655
    aSign = extractFloatx80Sign( a );
3656
    bSig = extractFloatx80Frac( b );
3657
    bExp = extractFloatx80Exp( b );
3658
    bSign = extractFloatx80Sign( b );
3659
    zSign = aSign ^ bSign;
3660
    if ( aExp == 0x7FFF ) {
3661
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3662
        if ( bExp == 0x7FFF ) {
3663
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3664
            goto invalid;
3665
        }
3666
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3667
    }
3668
    if ( bExp == 0x7FFF ) {
3669
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3670
        return packFloatx80( zSign, 0, 0 );
3671
    }
3672
    if ( bExp == 0 ) {
3673
        if ( bSig == 0 ) {
3674
            if ( ( aExp | aSig ) == 0 ) {
3675
 invalid:
3676
                float_raise( float_flag_invalid STATUS_VAR);
3677
                z.low = floatx80_default_nan_low;
3678
                z.high = floatx80_default_nan_high;
3679
                return z;
3680
            }
3681
            float_raise( float_flag_divbyzero STATUS_VAR);
3682
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3683
        }
3684
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3685
    }
3686
    if ( aExp == 0 ) {
3687
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3688
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3689
    }
3690
    zExp = aExp - bExp + 0x3FFE;
3691
    rem1 = 0;
3692
    if ( bSig <= aSig ) {
3693
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
3694
        ++zExp;
3695
    }
3696
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3697
    mul64To128( bSig, zSig0, &term0, &term1 );
3698
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3699
    while ( (sbits64) rem0 < 0 ) {
3700
        --zSig0;
3701
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3702
    }
3703
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
3704
    if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3705
        mul64To128( bSig, zSig1, &term1, &term2 );
3706
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3707
        while ( (sbits64) rem1 < 0 ) {
3708
            --zSig1;
3709
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3710
        }
3711
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
3712
    }
3713
    return
3714
        roundAndPackFloatx80(
3715
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3716

    
3717
}
3718

    
3719
/*----------------------------------------------------------------------------
3720
| Returns the remainder of the extended double-precision floating-point value
3721
| `a' with respect to the corresponding value `b'.  The operation is performed
3722
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3723
*----------------------------------------------------------------------------*/
3724

    
3725
floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
3726
{
3727
    flag aSign, bSign, zSign;
3728
    int32 aExp, bExp, expDiff;
3729
    bits64 aSig0, aSig1, bSig;
3730
    bits64 q, term0, term1, alternateASig0, alternateASig1;
3731
    floatx80 z;
3732

    
3733
    aSig0 = extractFloatx80Frac( a );
3734
    aExp = extractFloatx80Exp( a );
3735
    aSign = extractFloatx80Sign( a );
3736
    bSig = extractFloatx80Frac( b );
3737
    bExp = extractFloatx80Exp( b );
3738
    bSign = extractFloatx80Sign( b );
3739
    if ( aExp == 0x7FFF ) {
3740
        if (    (bits64) ( aSig0<<1 )
3741
             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3742
            return propagateFloatx80NaN( a, b STATUS_VAR );
3743
        }
3744
        goto invalid;
3745
    }
3746
    if ( bExp == 0x7FFF ) {
3747
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3748
        return a;
3749
    }
3750
    if ( bExp == 0 ) {
3751
        if ( bSig == 0 ) {
3752
 invalid:
3753
            float_raise( float_flag_invalid STATUS_VAR);
3754
            z.low = floatx80_default_nan_low;
3755
            z.high = floatx80_default_nan_high;
3756
            return z;
3757
        }
3758
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3759
    }
3760
    if ( aExp == 0 ) {
3761
        if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3762
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3763
    }
3764
    bSig |= LIT64( 0x8000000000000000 );
3765
    zSign = aSign;
3766
    expDiff = aExp - bExp;
3767
    aSig1 = 0;
3768
    if ( expDiff < 0 ) {
3769
        if ( expDiff < -1 ) return a;
3770
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3771
        expDiff = 0;
3772
    }
3773
    q = ( bSig <= aSig0 );
3774
    if ( q ) aSig0 -= bSig;
3775
    expDiff -= 64;
3776
    while ( 0 < expDiff ) {
3777
        q = estimateDiv128To64( aSig0, aSig1, bSig );
3778
        q = ( 2 < q ) ? q - 2 : 0;
3779
        mul64To128( bSig, q, &term0, &term1 );
3780
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3781
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3782
        expDiff -= 62;
3783
    }
3784
    expDiff += 64;
3785
    if ( 0 < expDiff ) {
3786
        q = estimateDiv128To64( aSig0, aSig1, bSig );
3787
        q = ( 2 < q ) ? q - 2 : 0;
3788
        q >>= 64 - expDiff;
3789
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3790
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3791
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3792
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
3793
            ++q;
3794
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3795
        }
3796
    }
3797
    else {
3798
        term1 = 0;
3799
        term0 = bSig;
3800
    }
3801
    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3802
    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3803
         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3804
              && ( q & 1 ) )
3805
       ) {
3806
        aSig0 = alternateASig0;
3807
        aSig1 = alternateASig1;
3808
        zSign = ! zSign;
3809
    }
3810
    return
3811
        normalizeRoundAndPackFloatx80(
3812
            80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
3813

    
3814
}
3815

    
3816
/*----------------------------------------------------------------------------
3817
| Returns the square root of the extended double-precision floating-point
3818
| value `a'.  The operation is performed according to the IEC/IEEE Standard
3819
| for Binary Floating-Point Arithmetic.
3820
*----------------------------------------------------------------------------*/
3821

    
3822
floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
3823
{
3824
    flag aSign;
3825
    int32 aExp, zExp;
3826
    bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3827
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3828
    floatx80 z;
3829

    
3830
    aSig0 = extractFloatx80Frac( a );
3831
    aExp = extractFloatx80Exp( a );
3832
    aSign = extractFloatx80Sign( a );
3833
    if ( aExp == 0x7FFF ) {
3834
        if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
3835
        if ( ! aSign ) return a;
3836
        goto invalid;
3837
    }
3838
    if ( aSign ) {
3839
        if ( ( aExp | aSig0 ) == 0 ) return a;
3840
 invalid:
3841
        float_raise( float_flag_invalid STATUS_VAR);
3842
        z.low = floatx80_default_nan_low;
3843
        z.high = floatx80_default_nan_high;
3844
        return z;
3845
    }
3846
    if ( aExp == 0 ) {
3847
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3848
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3849
    }
3850
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3851
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3852
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
3853
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
3854
    doubleZSig0 = zSig0<<1;
3855
    mul64To128( zSig0, zSig0, &term0, &term1 );
3856
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3857
    while ( (sbits64) rem0 < 0 ) {
3858
        --zSig0;
3859
        doubleZSig0 -= 2;
3860
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
3861
    }
3862
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
3863
    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3864
        if ( zSig1 == 0 ) zSig1 = 1;
3865
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
3866
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3867
        mul64To128( zSig1, zSig1, &term2, &term3 );
3868
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3869
        while ( (sbits64) rem1 < 0 ) {
3870
            --zSig1;
3871
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
3872
            term3 |= 1;
3873
            term2 |= doubleZSig0;
3874
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
3875
        }
3876
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3877
    }
3878
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
3879
    zSig0 |= doubleZSig0;
3880
    return
3881
        roundAndPackFloatx80(
3882
            STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
3883

    
3884
}
3885

    
3886
/*----------------------------------------------------------------------------
3887
| Returns 1 if the extended double-precision floating-point value `a' is
3888
| equal to the corresponding value `b', and 0 otherwise.  The comparison is
3889
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3890
| Arithmetic.
3891
*----------------------------------------------------------------------------*/
3892

    
3893
flag floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
3894
{
3895

    
3896
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3897
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3898
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3899
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3900
       ) {
3901
        if (    floatx80_is_signaling_nan( a )
3902
             || floatx80_is_signaling_nan( b ) ) {
3903
            float_raise( float_flag_invalid STATUS_VAR);
3904
        }
3905
        return 0;
3906
    }
3907
    return
3908
           ( a.low == b.low )
3909
        && (    ( a.high == b.high )
3910
             || (    ( a.low == 0 )
3911
                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3912
           );
3913

    
3914
}
3915

    
3916
/*----------------------------------------------------------------------------
3917
| Returns 1 if the extended double-precision floating-point value `a' is
3918
| less than or equal to the corresponding value `b', and 0 otherwise.  The
3919
| comparison is performed according to the IEC/IEEE Standard for Binary
3920
| Floating-Point Arithmetic.
3921
*----------------------------------------------------------------------------*/
3922

    
3923
flag floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
3924
{
3925
    flag aSign, bSign;
3926

    
3927
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3928
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3929
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3930
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3931
       ) {
3932
        float_raise( float_flag_invalid STATUS_VAR);
3933
        return 0;
3934
    }
3935
    aSign = extractFloatx80Sign( a );
3936
    bSign = extractFloatx80Sign( b );
3937
    if ( aSign != bSign ) {
3938
        return
3939
               aSign
3940
            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3941
                 == 0 );
3942
    }
3943
    return
3944
          aSign ? le128( b.high, b.low, a.high, a.low )
3945
        : le128( a.high, a.low, b.high, b.low );
3946

    
3947
}
3948

    
3949
/*----------------------------------------------------------------------------
3950
| Returns 1 if the extended double-precision floating-point value `a' is
3951
| less than the corresponding value `b', and 0 otherwise.  The comparison
3952
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3953
| Arithmetic.
3954
*----------------------------------------------------------------------------*/
3955

    
3956
flag floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
3957
{
3958
    flag aSign, bSign;
3959

    
3960
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3961
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3962
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3963
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3964
       ) {
3965
        float_raise( float_flag_invalid STATUS_VAR);
3966
        return 0;
3967
    }
3968
    aSign = extractFloatx80Sign( a );
3969
    bSign = extractFloatx80Sign( b );
3970
    if ( aSign != bSign ) {
3971
        return
3972
               aSign
3973
            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3974
                 != 0 );
3975
    }
3976
    return
3977
          aSign ? lt128( b.high, b.low, a.high, a.low )
3978
        : lt128( a.high, a.low, b.high, b.low );
3979

    
3980
}
3981

    
3982
/*----------------------------------------------------------------------------
3983
| Returns 1 if the extended double-precision floating-point value `a' is equal
3984
| to the corresponding value `b', and 0 otherwise.  The invalid exception is
3985
| raised if either operand is a NaN.  Otherwise, the comparison is performed
3986
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3987
*----------------------------------------------------------------------------*/
3988

    
3989
flag floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
3990
{
3991

    
3992
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3993
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3994
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3995
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3996
       ) {
3997
        float_raise( float_flag_invalid STATUS_VAR);
3998
        return 0;
3999
    }
4000
    return
4001
           ( a.low == b.low )
4002
        && (    ( a.high == b.high )
4003
             || (    ( a.low == 0 )
4004
                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4005
           );
4006

    
4007
}
4008

    
4009
/*----------------------------------------------------------------------------
4010
| Returns 1 if the extended double-precision floating-point value `a' is less
4011
| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4012
| do not cause an exception.  Otherwise, the comparison is performed according
4013
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4014
*----------------------------------------------------------------------------*/
4015

    
4016
flag floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4017
{
4018
    flag aSign, bSign;
4019

    
4020
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4021
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4022
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4023
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4024
       ) {
4025
        if (    floatx80_is_signaling_nan( a )
4026
             || floatx80_is_signaling_nan( b ) ) {
4027
            float_raise( float_flag_invalid STATUS_VAR);
4028
        }
4029
        return 0;
4030
    }
4031
    aSign = extractFloatx80Sign( a );
4032
    bSign = extractFloatx80Sign( b );
4033
    if ( aSign != bSign ) {
4034
        return
4035
               aSign
4036
            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4037
                 == 0 );
4038
    }
4039
    return
4040
          aSign ? le128( b.high, b.low, a.high, a.low )
4041
        : le128( a.high, a.low, b.high, b.low );
4042

    
4043
}
4044

    
4045
/*----------------------------------------------------------------------------
4046
| Returns 1 if the extended double-precision floating-point value `a' is less
4047
| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4048
| an exception.  Otherwise, the comparison is performed according to the
4049
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4050
*----------------------------------------------------------------------------*/
4051

    
4052
flag floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4053
{
4054
    flag aSign, bSign;
4055

    
4056
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4057
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4058
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4059
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4060
       ) {
4061
        if (    floatx80_is_signaling_nan( a )
4062
             || floatx80_is_signaling_nan( b ) ) {
4063
            float_raise( float_flag_invalid STATUS_VAR);
4064
        }
4065
        return 0;
4066
    }
4067
    aSign = extractFloatx80Sign( a );
4068
    bSign = extractFloatx80Sign( b );
4069
    if ( aSign != bSign ) {
4070
        return
4071
               aSign
4072
            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4073
                 != 0 );
4074
    }
4075
    return
4076
          aSign ? lt128( b.high, b.low, a.high, a.low )
4077
        : lt128( a.high, a.low, b.high, b.low );
4078

    
4079
}
4080

    
4081
#endif
4082

    
4083
#ifdef FLOAT128
4084

    
4085
/*----------------------------------------------------------------------------
4086
| Returns the result of converting the quadruple-precision floating-point
4087
| value `a' to the 32-bit two's complement integer format.  The conversion
4088
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4089
| Arithmetic---which means in particular that the conversion is rounded
4090
| according to the current rounding mode.  If `a' is a NaN, the largest
4091
| positive integer is returned.  Otherwise, if the conversion overflows, the
4092
| largest integer with the same sign as `a' is returned.
4093
*----------------------------------------------------------------------------*/
4094

    
4095
int32 float128_to_int32( float128 a STATUS_PARAM )
4096
{
4097
    flag aSign;
4098
    int32 aExp, shiftCount;
4099
    bits64 aSig0, aSig1;
4100

    
4101
    aSig1 = extractFloat128Frac1( a );
4102
    aSig0 = extractFloat128Frac0( a );
4103
    aExp = extractFloat128Exp( a );
4104
    aSign = extractFloat128Sign( a );
4105
    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4106
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4107
    aSig0 |= ( aSig1 != 0 );
4108
    shiftCount = 0x4028 - aExp;
4109
    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4110
    return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4111

    
4112
}
4113

    
4114
/*----------------------------------------------------------------------------
4115
| Returns the result of converting the quadruple-precision floating-point
4116
| value `a' to the 32-bit two's complement integer format.  The conversion
4117
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4118
| Arithmetic, except that the conversion is always rounded toward zero.  If
4119
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4120
| conversion overflows, the largest integer with the same sign as `a' is
4121
| returned.
4122
*----------------------------------------------------------------------------*/
4123

    
4124
int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4125
{
4126
    flag aSign;
4127
    int32 aExp, shiftCount;
4128
    bits64 aSig0, aSig1, savedASig;
4129
    int32 z;
4130

    
4131
    aSig1 = extractFloat128Frac1( a );
4132
    aSig0 = extractFloat128Frac0( a );
4133
    aExp = extractFloat128Exp( a );
4134
    aSign = extractFloat128Sign( a );
4135
    aSig0 |= ( aSig1 != 0 );
4136
    if ( 0x401E < aExp ) {
4137
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4138
        goto invalid;
4139
    }
4140
    else if ( aExp < 0x3FFF ) {
4141
        if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4142
        return 0;
4143
    }
4144
    aSig0 |= LIT64( 0x0001000000000000 );
4145
    shiftCount = 0x402F - aExp;
4146
    savedASig = aSig0;
4147
    aSig0 >>= shiftCount;
4148
    z = aSig0;
4149
    if ( aSign ) z = - z;
4150
    if ( ( z < 0 ) ^ aSign ) {
4151
 invalid:
4152
        float_raise( float_flag_invalid STATUS_VAR);
4153
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4154
    }
4155
    if ( ( aSig0<<shiftCount ) != savedASig ) {
4156
        STATUS(float_exception_flags) |= float_flag_inexact;
4157
    }
4158
    return z;
4159

    
4160
}
4161

    
4162
/*----------------------------------------------------------------------------
4163
| Returns the result of converting the quadruple-precision floating-point
4164
| value `a' to the 64-bit two's complement integer format.  The conversion
4165
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4166
| Arithmetic---which means in particular that the conversion is rounded
4167
| according to the current rounding mode.  If `a' is a NaN, the largest
4168
| positive integer is returned.  Otherwise, if the conversion overflows, the
4169
| largest integer with the same sign as `a' is returned.
4170
*----------------------------------------------------------------------------*/
4171

    
4172
int64 float128_to_int64( float128 a STATUS_PARAM )
4173
{
4174
    flag aSign;
4175
    int32 aExp, shiftCount;
4176
    bits64 aSig0, aSig1;
4177

    
4178
    aSig1 = extractFloat128Frac1( a );
4179
    aSig0 = extractFloat128Frac0( a );
4180
    aExp = extractFloat128Exp( a );
4181
    aSign = extractFloat128Sign( a );
4182
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4183
    shiftCount = 0x402F - aExp;
4184
    if ( shiftCount <= 0 ) {
4185
        if ( 0x403E < aExp ) {
4186
            float_raise( float_flag_invalid STATUS_VAR);
4187
            if (    ! aSign
4188
                 || (    ( aExp == 0x7FFF )
4189
                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4190
                    )
4191
               ) {
4192
                return LIT64( 0x7FFFFFFFFFFFFFFF );
4193
            }
4194
            return (sbits64) LIT64( 0x8000000000000000 );
4195
        }
4196
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4197
    }
4198
    else {
4199
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4200
    }
4201
    return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4202

    
4203
}
4204

    
4205
/*----------------------------------------------------------------------------
4206
| Returns the result of converting the quadruple-precision floating-point
4207
| value `a' to the 64-bit two's complement integer format.  The conversion
4208
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4209
| Arithmetic, except that the conversion is always rounded toward zero.
4210
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4211
| the conversion overflows, the largest integer with the same sign as `a' is
4212
| returned.
4213
*----------------------------------------------------------------------------*/
4214

    
4215
int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4216
{
4217
    flag aSign;
4218
    int32 aExp, shiftCount;
4219
    bits64 aSig0, aSig1;
4220
    int64 z;
4221

    
4222
    aSig1 = extractFloat128Frac1( a );
4223
    aSig0 = extractFloat128Frac0( a );
4224
    aExp = extractFloat128Exp( a );
4225
    aSign = extractFloat128Sign( a );
4226
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4227
    shiftCount = aExp - 0x402F;
4228
    if ( 0 < shiftCount ) {
4229
        if ( 0x403E <= aExp ) {
4230
            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4231
            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4232
                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4233
                if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4234
            }
4235
            else {
4236
                float_raise( float_flag_invalid STATUS_VAR);
4237
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4238
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
4239
                }
4240
            }
4241
            return (sbits64) LIT64( 0x8000000000000000 );
4242
        }
4243
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4244
        if ( (bits64) ( aSig1<<shiftCount ) ) {
4245
            STATUS(float_exception_flags) |= float_flag_inexact;
4246
        }
4247
    }
4248
    else {
4249
        if ( aExp < 0x3FFF ) {
4250
            if ( aExp | aSig0 | aSig1 ) {
4251
                STATUS(float_exception_flags) |= float_flag_inexact;
4252
            }
4253
            return 0;
4254
        }
4255
        z = aSig0>>( - shiftCount );
4256
        if (    aSig1
4257
             || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4258
            STATUS(float_exception_flags) |= float_flag_inexact;
4259
        }
4260
    }
4261
    if ( aSign ) z = - z;
4262
    return z;
4263

    
4264
}
4265

    
4266
/*----------------------------------------------------------------------------
4267
| Returns the result of converting the quadruple-precision floating-point
4268
| value `a' to the single-precision floating-point format.  The conversion
4269
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4270
| Arithmetic.
4271
*----------------------------------------------------------------------------*/
4272

    
4273
float32 float128_to_float32( float128 a STATUS_PARAM )
4274
{
4275
    flag aSign;
4276
    int32 aExp;
4277
    bits64 aSig0, aSig1;
4278
    bits32 zSig;
4279

    
4280
    aSig1 = extractFloat128Frac1( a );
4281
    aSig0 = extractFloat128Frac0( a );
4282
    aExp = extractFloat128Exp( a );
4283
    aSign = extractFloat128Sign( a );
4284
    if ( aExp == 0x7FFF ) {
4285
        if ( aSig0 | aSig1 ) {
4286
            return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4287
        }
4288
        return packFloat32( aSign, 0xFF, 0 );
4289
    }
4290
    aSig0 |= ( aSig1 != 0 );
4291
    shift64RightJamming( aSig0, 18, &aSig0 );
4292
    zSig = aSig0;
4293
    if ( aExp || zSig ) {
4294
        zSig |= 0x40000000;
4295
        aExp -= 0x3F81;
4296
    }
4297
    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4298

    
4299
}
4300

    
4301
/*----------------------------------------------------------------------------
4302
| Returns the result of converting the quadruple-precision floating-point
4303
| value `a' to the double-precision floating-point format.  The conversion
4304
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4305
| Arithmetic.
4306
*----------------------------------------------------------------------------*/
4307

    
4308
float64 float128_to_float64( float128 a STATUS_PARAM )
4309
{
4310
    flag aSign;
4311
    int32 aExp;
4312
    bits64 aSig0, aSig1;
4313

    
4314
    aSig1 = extractFloat128Frac1( a );
4315
    aSig0 = extractFloat128Frac0( a );
4316
    aExp = extractFloat128Exp( a );
4317
    aSign = extractFloat128Sign( a );
4318
    if ( aExp == 0x7FFF ) {
4319
        if ( aSig0 | aSig1 ) {
4320
            return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4321
        }
4322
        return packFloat64( aSign, 0x7FF, 0 );
4323
    }
4324
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4325
    aSig0 |= ( aSig1 != 0 );
4326
    if ( aExp || aSig0 ) {
4327
        aSig0 |= LIT64( 0x4000000000000000 );
4328
        aExp -= 0x3C01;
4329
    }
4330
    return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4331

    
4332
}
4333

    
4334
#ifdef FLOATX80
4335

    
4336
/*----------------------------------------------------------------------------
4337
| Returns the result of converting the quadruple-precision floating-point
4338
| value `a' to the extended double-precision floating-point format.  The
4339
| conversion is performed according to the IEC/IEEE Standard for Binary
4340
| Floating-Point Arithmetic.
4341
*----------------------------------------------------------------------------*/
4342

    
4343
floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4344
{
4345
    flag aSign;
4346
    int32 aExp;
4347
    bits64 aSig0, aSig1;
4348

    
4349
    aSig1 = extractFloat128Frac1( a );
4350
    aSig0 = extractFloat128Frac0( a );
4351
    aExp = extractFloat128Exp( a );
4352
    aSign = extractFloat128Sign( a );
4353
    if ( aExp == 0x7FFF ) {
4354
        if ( aSig0 | aSig1 ) {
4355
            return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4356
        }
4357
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4358
    }
4359
    if ( aExp == 0 ) {
4360
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4361
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4362
    }
4363
    else {
4364
        aSig0 |= LIT64( 0x0001000000000000 );
4365
    }
4366
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4367
    return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4368

    
4369
}
4370

    
4371
#endif
4372

    
4373
/*----------------------------------------------------------------------------
4374
| Rounds the quadruple-precision floating-point value `a' to an integer, and
4375
| returns the result as a quadruple-precision floating-point value.  The
4376
| operation is performed according to the IEC/IEEE Standard for Binary
4377
| Floating-Point Arithmetic.
4378
*----------------------------------------------------------------------------*/
4379

    
4380
float128 float128_round_to_int( float128 a STATUS_PARAM )
4381
{
4382
    flag aSign;
4383
    int32 aExp;
4384
    bits64 lastBitMask, roundBitsMask;
4385
    int8 roundingMode;
4386
    float128 z;
4387

    
4388
    aExp = extractFloat128Exp( a );
4389
    if ( 0x402F <= aExp ) {
4390
        if ( 0x406F <= aExp ) {
4391
            if (    ( aExp == 0x7FFF )
4392
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4393
               ) {
4394
                return propagateFloat128NaN( a, a STATUS_VAR );
4395
            }
4396
            return a;
4397
        }
4398
        lastBitMask = 1;
4399
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4400
        roundBitsMask = lastBitMask - 1;
4401
        z = a;
4402
        roundingMode = STATUS(float_rounding_mode);
4403
        if ( roundingMode == float_round_nearest_even ) {
4404
            if ( lastBitMask ) {
4405
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4406
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4407
            }
4408
            else {
4409
                if ( (sbits64) z.low < 0 ) {
4410
                    ++z.high;
4411
                    if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4412
                }
4413
            }
4414
        }
4415
        else if ( roundingMode != float_round_to_zero ) {
4416
            if (   extractFloat128Sign( z )
4417
                 ^ ( roundingMode == float_round_up ) ) {
4418
                add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4419
            }
4420
        }
4421
        z.low &= ~ roundBitsMask;
4422
    }
4423
    else {
4424
        if ( aExp < 0x3FFF ) {
4425
            if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4426
            STATUS(float_exception_flags) |= float_flag_inexact;
4427
            aSign = extractFloat128Sign( a );
4428
            switch ( STATUS(float_rounding_mode) ) {
4429
             case float_round_nearest_even:
4430
                if (    ( aExp == 0x3FFE )
4431
                     && (   extractFloat128Frac0( a )
4432
                          | extractFloat128Frac1( a ) )
4433
                   ) {
4434
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
4435
                }
4436
                break;
4437
             case float_round_down:
4438
                return
4439
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4440
                    : packFloat128( 0, 0, 0, 0 );
4441
             case float_round_up:
4442
                return
4443
                      aSign ? packFloat128( 1, 0, 0, 0 )
4444
                    : packFloat128( 0, 0x3FFF, 0, 0 );
4445
            }
4446
            return packFloat128( aSign, 0, 0, 0 );
4447
        }
4448
        lastBitMask = 1;
4449
        lastBitMask <<= 0x402F - aExp;
4450
        roundBitsMask = lastBitMask - 1;
4451
        z.low = 0;
4452
        z.high = a.high;
4453
        roundingMode = STATUS(float_rounding_mode);
4454
        if ( roundingMode == float_round_nearest_even ) {
4455
            z.high += lastBitMask>>1;
4456
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4457
                z.high &= ~ lastBitMask;
4458
            }
4459
        }
4460
        else if ( roundingMode != float_round_to_zero ) {
4461
            if (   extractFloat128Sign( z )
4462
                 ^ ( roundingMode == float_round_up ) ) {
4463
                z.high |= ( a.low != 0 );
4464
                z.high += roundBitsMask;
4465
            }
4466
        }
4467
        z.high &= ~ roundBitsMask;
4468
    }
4469
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4470
        STATUS(float_exception_flags) |= float_flag_inexact;
4471
    }
4472
    return z;
4473

    
4474
}
4475

    
4476
/*----------------------------------------------------------------------------
4477
| Returns the result of adding the absolute values of the quadruple-precision
4478
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4479
| before being returned.  `zSign' is ignored if the result is a NaN.
4480
| The addition is performed according to the IEC/IEEE Standard for Binary
4481
| Floating-Point Arithmetic.
4482
*----------------------------------------------------------------------------*/
4483

    
4484
static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4485
{
4486
    int32 aExp, bExp, zExp;
4487
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4488
    int32 expDiff;
4489

    
4490
    aSig1 = extractFloat128Frac1( a );
4491
    aSig0 = extractFloat128Frac0( a );
4492
    aExp = extractFloat128Exp( a );
4493
    bSig1 = extractFloat128Frac1( b );
4494
    bSig0 = extractFloat128Frac0( b );
4495
    bExp = extractFloat128Exp( b );
4496
    expDiff = aExp - bExp;
4497
    if ( 0 < expDiff ) {
4498
        if ( aExp == 0x7FFF ) {
4499
            if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4500
            return a;
4501
        }
4502
        if ( bExp == 0 ) {
4503
            --expDiff;
4504
        }
4505
        else {
4506
            bSig0 |= LIT64( 0x0001000000000000 );
4507
        }
4508
        shift128ExtraRightJamming(
4509
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4510
        zExp = aExp;
4511
    }
4512
    else if ( expDiff < 0 ) {
4513
        if ( bExp == 0x7FFF ) {
4514
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4515
            return packFloat128( zSign, 0x7FFF, 0, 0 );
4516
        }
4517
        if ( aExp == 0 ) {
4518
            ++expDiff;
4519
        }
4520
        else {
4521
            aSig0 |= LIT64( 0x0001000000000000 );
4522
        }
4523
        shift128ExtraRightJamming(
4524
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4525
        zExp = bExp;
4526
    }
4527
    else {
4528
        if ( aExp == 0x7FFF ) {
4529
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4530
                return propagateFloat128NaN( a, b STATUS_VAR );
4531
            }
4532
            return a;
4533
        }
4534
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4535
        if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4536
        zSig2 = 0;
4537
        zSig0 |= LIT64( 0x0002000000000000 );
4538
        zExp = aExp;
4539
        goto shiftRight1;
4540
    }
4541
    aSig0 |= LIT64( 0x0001000000000000 );
4542
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4543
    --zExp;
4544
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4545
    ++zExp;
4546
 shiftRight1:
4547
    shift128ExtraRightJamming(
4548
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4549
 roundAndPack:
4550
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4551

    
4552
}
4553

    
4554
/*----------------------------------------------------------------------------
4555
| Returns the result of subtracting the absolute values of the quadruple-
4556
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
4557
| difference is negated before being returned.  `zSign' is ignored if the
4558
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
4559
| Standard for Binary Floating-Point Arithmetic.
4560
*----------------------------------------------------------------------------*/
4561

    
4562
static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4563
{
4564
    int32 aExp, bExp, zExp;
4565
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4566
    int32 expDiff;
4567
    float128 z;
4568

    
4569
    aSig1 = extractFloat128Frac1( a );
4570
    aSig0 = extractFloat128Frac0( a );
4571
    aExp = extractFloat128Exp( a );
4572
    bSig1 = extractFloat128Frac1( b );
4573
    bSig0 = extractFloat128Frac0( b );
4574
    bExp = extractFloat128Exp( b );
4575
    expDiff = aExp - bExp;
4576
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4577
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4578
    if ( 0 < expDiff ) goto aExpBigger;
4579
    if ( expDiff < 0 ) goto bExpBigger;
4580
    if ( aExp == 0x7FFF ) {
4581
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4582
            return propagateFloat128NaN( a, b STATUS_VAR );
4583
        }
4584
        float_raise( float_flag_invalid STATUS_VAR);
4585
        z.low = float128_default_nan_low;
4586
        z.high = float128_default_nan_high;
4587
        return z;
4588
    }
4589
    if ( aExp == 0 ) {
4590
        aExp = 1;
4591
        bExp = 1;
4592
    }
4593
    if ( bSig0 < aSig0 ) goto aBigger;
4594
    if ( aSig0 < bSig0 ) goto bBigger;
4595
    if ( bSig1 < aSig1 ) goto aBigger;
4596
    if ( aSig1 < bSig1 ) goto bBigger;
4597
    return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
4598
 bExpBigger:
4599
    if ( bExp == 0x7FFF ) {
4600
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4601
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4602
    }
4603
    if ( aExp == 0 ) {
4604
        ++expDiff;
4605
    }
4606
    else {
4607
        aSig0 |= LIT64( 0x4000000000000000 );
4608
    }
4609
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4610
    bSig0 |= LIT64( 0x4000000000000000 );
4611
 bBigger:
4612
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4613
    zExp = bExp;
4614
    zSign ^= 1;
4615
    goto normalizeRoundAndPack;
4616
 aExpBigger:
4617
    if ( aExp == 0x7FFF ) {
4618
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4619
        return a;
4620
    }
4621
    if ( bExp == 0 ) {
4622
        --expDiff;
4623
    }
4624
    else {
4625
        bSig0 |= LIT64( 0x4000000000000000 );
4626
    }
4627
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4628
    aSig0 |= LIT64( 0x4000000000000000 );
4629
 aBigger:
4630
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4631
    zExp = aExp;
4632
 normalizeRoundAndPack:
4633
    --zExp;
4634
    return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
4635

    
4636
}
4637

    
4638
/*----------------------------------------------------------------------------
4639
| Returns the result of adding the quadruple-precision floating-point values
4640
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4641
| for Binary Floating-Point Arithmetic.
4642
*----------------------------------------------------------------------------*/
4643

    
4644
float128 float128_add( float128 a, float128 b STATUS_PARAM )
4645
{
4646
    flag aSign, bSign;
4647

    
4648
    aSign = extractFloat128Sign( a );
4649
    bSign = extractFloat128Sign( b );
4650
    if ( aSign == bSign ) {
4651
        return addFloat128Sigs( a, b, aSign STATUS_VAR );
4652
    }
4653
    else {
4654
        return subFloat128Sigs( a, b, aSign STATUS_VAR );
4655
    }
4656

    
4657
}
4658

    
4659
/*----------------------------------------------------------------------------
4660
| Returns the result of subtracting the quadruple-precision floating-point
4661
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4662
| Standard for Binary Floating-Point Arithmetic.
4663
*----------------------------------------------------------------------------*/
4664

    
4665
float128 float128_sub( float128 a, float128 b STATUS_PARAM )
4666
{
4667
    flag aSign, bSign;
4668

    
4669
    aSign = extractFloat128Sign( a );
4670
    bSign = extractFloat128Sign( b );
4671
    if ( aSign == bSign ) {
4672
        return subFloat128Sigs( a, b, aSign STATUS_VAR );
4673
    }
4674
    else {
4675
        return addFloat128Sigs( a, b, aSign STATUS_VAR );
4676
    }
4677

    
4678
}
4679

    
4680
/*----------------------------------------------------------------------------
4681
| Returns the result of multiplying the quadruple-precision floating-point
4682
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4683
| Standard for Binary Floating-Point Arithmetic.
4684
*----------------------------------------------------------------------------*/
4685

    
4686
float128 float128_mul( float128 a, float128 b STATUS_PARAM )
4687
{
4688
    flag aSign, bSign, zSign;
4689
    int32 aExp, bExp, zExp;
4690
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4691
    float128 z;
4692

    
4693
    aSig1 = extractFloat128Frac1( a );
4694
    aSig0 = extractFloat128Frac0( a );
4695
    aExp = extractFloat128Exp( a );
4696
    aSign = extractFloat128Sign( a );
4697
    bSig1 = extractFloat128Frac1( b );
4698
    bSig0 = extractFloat128Frac0( b );
4699
    bExp = extractFloat128Exp( b );
4700
    bSign = extractFloat128Sign( b );
4701
    zSign = aSign ^ bSign;
4702
    if ( aExp == 0x7FFF ) {
4703
        if (    ( aSig0 | aSig1 )
4704
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4705
            return propagateFloat128NaN( a, b STATUS_VAR );
4706
        }
4707
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4708
        return packFloat128( zSign, 0x7FFF, 0, 0 );
4709
    }
4710
    if ( bExp == 0x7FFF ) {
4711
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4712
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4713
 invalid:
4714
            float_raise( float_flag_invalid STATUS_VAR);
4715
            z.low = float128_default_nan_low;
4716
            z.high = float128_default_nan_high;
4717
            return z;
4718
        }
4719
        return packFloat128( zSign, 0x7FFF, 0, 0 );
4720
    }
4721
    if ( aExp == 0 ) {
4722
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4723
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4724
    }
4725
    if ( bExp == 0 ) {
4726
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4727
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4728
    }
4729
    zExp = aExp + bExp - 0x4000;
4730
    aSig0 |= LIT64( 0x0001000000000000 );
4731
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4732
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4733
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4734
    zSig2 |= ( zSig3 != 0 );
4735
    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4736
        shift128ExtraRightJamming(
4737
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4738
        ++zExp;
4739
    }
4740
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4741

    
4742
}
4743

    
4744
/*----------------------------------------------------------------------------
4745
| Returns the result of dividing the quadruple-precision floating-point value
4746
| `a' by the corresponding value `b'.  The operation is performed according to
4747
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4748
*----------------------------------------------------------------------------*/
4749

    
4750
float128 float128_div( float128 a, float128 b STATUS_PARAM )
4751
{
4752
    flag aSign, bSign, zSign;
4753
    int32 aExp, bExp, zExp;
4754
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4755
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4756
    float128 z;
4757

    
4758
    aSig1 = extractFloat128Frac1( a );
4759
    aSig0 = extractFloat128Frac0( a );
4760
    aExp = extractFloat128Exp( a );
4761
    aSign = extractFloat128Sign( a );
4762
    bSig1 = extractFloat128Frac1( b );
4763
    bSig0 = extractFloat128Frac0( b );
4764
    bExp = extractFloat128Exp( b );
4765
    bSign = extractFloat128Sign( b );
4766
    zSign = aSign ^ bSign;
4767
    if ( aExp == 0x7FFF ) {
4768
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4769
        if ( bExp == 0x7FFF ) {
4770
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4771
            goto invalid;
4772
        }
4773
        return packFloat128( zSign, 0x7FFF, 0, 0 );
4774
    }
4775
    if ( bExp == 0x7FFF ) {
4776
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4777
        return packFloat128( zSign, 0, 0, 0 );
4778
    }
4779
    if ( bExp == 0 ) {
4780
        if ( ( bSig0 | bSig1 ) == 0 ) {
4781
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4782
 invalid:
4783
                float_raise( float_flag_invalid STATUS_VAR);
4784
                z.low = float128_default_nan_low;
4785
                z.high = float128_default_nan_high;
4786
                return z;
4787
            }
4788
            float_raise( float_flag_divbyzero STATUS_VAR);
4789
            return packFloat128( zSign, 0x7FFF, 0, 0 );
4790
        }
4791
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4792
    }
4793
    if ( aExp == 0 ) {
4794
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4795
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4796
    }
4797
    zExp = aExp - bExp + 0x3FFD;
4798
    shortShift128Left(
4799
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4800
    shortShift128Left(
4801
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4802
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4803
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4804
        ++zExp;
4805
    }
4806
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4807
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4808
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4809
    while ( (sbits64) rem0 < 0 ) {
4810
        --zSig0;
4811
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4812
    }
4813
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4814
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4815
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4816
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4817
        while ( (sbits64) rem1 < 0 ) {
4818
            --zSig1;
4819
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4820
        }
4821
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4822
    }
4823
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4824
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4825

    
4826
}
4827

    
4828
/*----------------------------------------------------------------------------
4829
| Returns the remainder of the quadruple-precision floating-point value `a'
4830
| with respect to the corresponding value `b'.  The operation is performed
4831
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4832
*----------------------------------------------------------------------------*/
4833

    
4834
float128 float128_rem( float128 a, float128 b STATUS_PARAM )
4835
{
4836
    flag aSign, bSign, zSign;
4837
    int32 aExp, bExp, expDiff;
4838
    bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
4839
    bits64 allZero, alternateASig0, alternateASig1, sigMean1;
4840
    sbits64 sigMean0;
4841
    float128 z;
4842

    
4843
    aSig1 = extractFloat128Frac1( a );
4844
    aSig0 = extractFloat128Frac0( a );
4845
    aExp = extractFloat128Exp( a );
4846
    aSign = extractFloat128Sign( a );
4847
    bSig1 = extractFloat128Frac1( b );
4848
    bSig0 = extractFloat128Frac0( b );
4849
    bExp = extractFloat128Exp( b );
4850
    bSign = extractFloat128Sign( b );
4851
    if ( aExp == 0x7FFF ) {
4852
        if (    ( aSig0 | aSig1 )
4853
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4854
            return propagateFloat128NaN( a, b STATUS_VAR );
4855
        }
4856
        goto invalid;
4857
    }
4858
    if ( bExp == 0x7FFF ) {
4859
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4860
        return a;
4861
    }
4862
    if ( bExp == 0 ) {
4863
        if ( ( bSig0 | bSig1 ) == 0 ) {
4864
 invalid:
4865
            float_raise( float_flag_invalid STATUS_VAR);
4866
            z.low = float128_default_nan_low;
4867
            z.high = float128_default_nan_high;
4868
            return z;
4869
        }
4870
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4871
    }
4872
    if ( aExp == 0 ) {
4873
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
4874
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4875
    }
4876
    expDiff = aExp - bExp;
4877
    if ( expDiff < -1 ) return a;
4878
    shortShift128Left(
4879
        aSig0 | LIT64( 0x0001000000000000 ),
4880
        aSig1,
4881
        15 - ( expDiff < 0 ),
4882
        &aSig0,
4883
        &aSig1
4884
    );
4885
    shortShift128Left(
4886
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4887
    q = le128( bSig0, bSig1, aSig0, aSig1 );
4888
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4889
    expDiff -= 64;
4890
    while ( 0 < expDiff ) {
4891
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4892
        q = ( 4 < q ) ? q - 4 : 0;
4893
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4894
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
4895
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
4896
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
4897
        expDiff -= 61;
4898
    }
4899
    if ( -64 < expDiff ) {
4900
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4901
        q = ( 4 < q ) ? q - 4 : 0;
4902
        q >>= - expDiff;
4903
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4904
        expDiff += 52;
4905
        if ( expDiff < 0 ) {
4906
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4907
        }
4908
        else {
4909
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
4910
        }
4911
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4912
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
4913
    }
4914
    else {
4915
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
4916
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4917
    }
4918
    do {
4919
        alternateASig0 = aSig0;
4920
        alternateASig1 = aSig1;
4921
        ++q;
4922
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4923
    } while ( 0 <= (sbits64) aSig0 );
4924
    add128(
4925
        aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
4926
    if (    ( sigMean0 < 0 )
4927
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
4928
        aSig0 = alternateASig0;
4929
        aSig1 = alternateASig1;
4930
    }
4931
    zSign = ( (sbits64) aSig0 < 0 );
4932
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
4933
    return
4934
        normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
4935

    
4936
}
4937

    
4938
/*----------------------------------------------------------------------------
4939
| Returns the square root of the quadruple-precision floating-point value `a'.
4940
| The operation is performed according to the IEC/IEEE Standard for Binary
4941
| Floating-Point Arithmetic.
4942
*----------------------------------------------------------------------------*/
4943

    
4944
float128 float128_sqrt( float128 a STATUS_PARAM )
4945
{
4946
    flag aSign;
4947
    int32 aExp, zExp;
4948
    bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
4949
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4950
    float128 z;
4951

    
4952
    aSig1 = extractFloat128Frac1( a );
4953
    aSig0 = extractFloat128Frac0( a );
4954
    aExp = extractFloat128Exp( a );
4955
    aSign = extractFloat128Sign( a );
4956
    if ( aExp == 0x7FFF ) {
4957
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
4958
        if ( ! aSign ) return a;
4959
        goto invalid;
4960
    }
4961
    if ( aSign ) {
4962
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
4963
 invalid:
4964
        float_raise( float_flag_invalid STATUS_VAR);
4965
        z.low = float128_default_nan_low;
4966
        z.high = float128_default_nan_high;
4967
        return z;
4968
    }
4969
    if ( aExp == 0 ) {
4970
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
4971
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4972
    }
4973
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
4974
    aSig0 |= LIT64( 0x0001000000000000 );
4975
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
4976
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
4977
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4978
    doubleZSig0 = zSig0<<1;
4979
    mul64To128( zSig0, zSig0, &term0, &term1 );
4980
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4981
    while ( (sbits64) rem0 < 0 ) {
4982
        --zSig0;
4983
        doubleZSig0 -= 2;
4984
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4985
    }
4986
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4987
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
4988
        if ( zSig1 == 0 ) zSig1 = 1;
4989
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4990
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4991
        mul64To128( zSig1, zSig1, &term2, &term3 );
4992
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4993
        while ( (sbits64) rem1 < 0 ) {
4994
            --zSig1;
4995
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4996
            term3 |= 1;
4997
            term2 |= doubleZSig0;
4998
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4999
        }
5000
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5001
    }
5002
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5003
    return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5004

    
5005
}
5006

    
5007
/*----------------------------------------------------------------------------
5008
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5009
| the corresponding value `b', and 0 otherwise.  The comparison is performed
5010
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5011
*----------------------------------------------------------------------------*/
5012

    
5013
flag float128_eq( float128 a, float128 b STATUS_PARAM )
5014
{
5015

    
5016
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5017
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5018
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5019
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5020
       ) {
5021
        if (    float128_is_signaling_nan( a )
5022
             || float128_is_signaling_nan( b ) ) {
5023
            float_raise( float_flag_invalid STATUS_VAR);
5024
        }
5025
        return 0;
5026
    }
5027
    return
5028
           ( a.low == b.low )
5029
        && (    ( a.high == b.high )
5030
             || (    ( a.low == 0 )
5031
                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5032
           );
5033

    
5034
}
5035

    
5036
/*----------------------------------------------------------------------------
5037
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5038
| or equal to the corresponding value `b', and 0 otherwise.  The comparison
5039
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5040
| Arithmetic.
5041
*----------------------------------------------------------------------------*/
5042

    
5043
flag float128_le( float128 a, float128 b STATUS_PARAM )
5044
{
5045
    flag aSign, bSign;
5046

    
5047
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5048
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5049
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5050
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5051
       ) {
5052
        float_raise( float_flag_invalid STATUS_VAR);
5053
        return 0;
5054
    }
5055
    aSign = extractFloat128Sign( a );
5056
    bSign = extractFloat128Sign( b );
5057
    if ( aSign != bSign ) {
5058
        return
5059
               aSign
5060
            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5061
                 == 0 );
5062
    }
5063
    return
5064
          aSign ? le128( b.high, b.low, a.high, a.low )
5065
        : le128( a.high, a.low, b.high, b.low );
5066

    
5067
}
5068

    
5069
/*----------------------------------------------------------------------------
5070
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5071
| the corresponding value `b', and 0 otherwise.  The comparison is performed
5072
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5073
*----------------------------------------------------------------------------*/
5074

    
5075
flag float128_lt( float128 a, float128 b STATUS_PARAM )
5076
{
5077
    flag aSign, bSign;
5078

    
5079
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5080
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5081
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5082
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5083
       ) {
5084
        float_raise( float_flag_invalid STATUS_VAR);
5085
        return 0;
5086
    }
5087
    aSign = extractFloat128Sign( a );
5088
    bSign = extractFloat128Sign( b );
5089
    if ( aSign != bSign ) {
5090
        return
5091
               aSign
5092
            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5093
                 != 0 );
5094
    }
5095
    return
5096
          aSign ? lt128( b.high, b.low, a.high, a.low )
5097
        : lt128( a.high, a.low, b.high, b.low );
5098

    
5099
}
5100

    
5101
/*----------------------------------------------------------------------------
5102
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5103
| the corresponding value `b', and 0 otherwise.  The invalid exception is
5104
| raised if either operand is a NaN.  Otherwise, the comparison is performed
5105
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5106
*----------------------------------------------------------------------------*/
5107

    
5108
flag float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5109
{
5110

    
5111
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5112
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5113
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5114
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5115
       ) {
5116
        float_raise( float_flag_invalid STATUS_VAR);
5117
        return 0;
5118
    }
5119
    return
5120
           ( a.low == b.low )
5121
        && (    ( a.high == b.high )
5122
             || (    ( a.low == 0 )
5123
                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5124
           );
5125

    
5126
}
5127

    
5128
/*----------------------------------------------------------------------------
5129
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5130
| or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5131
| cause an exception.  Otherwise, the comparison is performed according to the
5132
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5133
*----------------------------------------------------------------------------*/
5134

    
5135
flag float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5136
{
5137
    flag aSign, bSign;
5138

    
5139
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5140
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5141
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5142
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5143
       ) {
5144
        if (    float128_is_signaling_nan( a )
5145
             || float128_is_signaling_nan( b ) ) {
5146
            float_raise( float_flag_invalid STATUS_VAR);
5147
        }
5148
        return 0;
5149
    }
5150
    aSign = extractFloat128Sign( a );
5151
    bSign = extractFloat128Sign( b );
5152
    if ( aSign != bSign ) {
5153
        return
5154
               aSign
5155
            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5156
                 == 0 );
5157
    }
5158
    return
5159
          aSign ? le128( b.high, b.low, a.high, a.low )
5160
        : le128( a.high, a.low, b.high, b.low );
5161

    
5162
}
5163

    
5164
/*----------------------------------------------------------------------------
5165
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5166
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5167
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5168
| Standard for Binary Floating-Point Arithmetic.
5169
*----------------------------------------------------------------------------*/
5170

    
5171
flag float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5172
{
5173
    flag aSign, bSign;
5174

    
5175
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5176
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5177
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5178
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5179
       ) {
5180
        if (    float128_is_signaling_nan( a )
5181
             || float128_is_signaling_nan( b ) ) {
5182
            float_raise( float_flag_invalid STATUS_VAR);
5183
        }
5184
        return 0;
5185
    }
5186
    aSign = extractFloat128Sign( a );
5187
    bSign = extractFloat128Sign( b );
5188
    if ( aSign != bSign ) {
5189
        return
5190
               aSign
5191
            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5192
                 != 0 );
5193
    }
5194
    return
5195
          aSign ? lt128( b.high, b.low, a.high, a.low )
5196
        : lt128( a.high, a.low, b.high, b.low );
5197

    
5198
}
5199

    
5200
#endif
5201

    
5202
/* misc functions */
5203
float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5204
{
5205
    return int64_to_float32(a STATUS_VAR);
5206
}
5207

    
5208
float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5209
{
5210
    return int64_to_float64(a STATUS_VAR);
5211
}
5212

    
5213
unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5214
{
5215
    int64_t v;
5216
    unsigned int res;
5217

    
5218
    v = float32_to_int64(a STATUS_VAR);
5219
    if (v < 0) {
5220
        res = 0;
5221
        float_raise( float_flag_invalid STATUS_VAR);
5222
    } else if (v > 0xffffffff) {
5223
        res = 0xffffffff;
5224
        float_raise( float_flag_invalid STATUS_VAR);
5225
    } else {
5226
        res = v;
5227
    }
5228
    return res;
5229
}
5230

    
5231
unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5232
{
5233
    int64_t v;
5234
    unsigned int res;
5235

    
5236
    v = float32_to_int64_round_to_zero(a STATUS_VAR);
5237
    if (v < 0) {
5238
        res = 0;
5239
        float_raise( float_flag_invalid STATUS_VAR);
5240
    } else if (v > 0xffffffff) {
5241
        res = 0xffffffff;
5242
        float_raise( float_flag_invalid STATUS_VAR);
5243
    } else {
5244
        res = v;
5245
    }
5246
    return res;
5247
}
5248

    
5249
unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5250
{
5251
    int64_t v;
5252
    unsigned int res;
5253

    
5254
    v = float64_to_int64(a STATUS_VAR);
5255
    if (v < 0) {
5256
        res = 0;
5257
        float_raise( float_flag_invalid STATUS_VAR);
5258
    } else if (v > 0xffffffff) {
5259
        res = 0xffffffff;
5260
        float_raise( float_flag_invalid STATUS_VAR);
5261
    } else {
5262
        res = v;
5263
    }
5264
    return res;
5265
}
5266

    
5267
unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5268
{
5269
    int64_t v;
5270
    unsigned int res;
5271

    
5272
    v = float64_to_int64_round_to_zero(a STATUS_VAR);
5273
    if (v < 0) {
5274
        res = 0;
5275
        float_raise( float_flag_invalid STATUS_VAR);
5276
    } else if (v > 0xffffffff) {
5277
        res = 0xffffffff;
5278
        float_raise( float_flag_invalid STATUS_VAR);
5279
    } else {
5280
        res = v;
5281
    }
5282
    return res;
5283
}
5284

    
5285
#define COMPARE(s, nan_exp)                                                  \
5286
INLINE char float ## s ## _compare_internal( float ## s a, float ## s b,     \
5287
                                      int is_quiet STATUS_PARAM )            \
5288
{                                                                            \
5289
    flag aSign, bSign;                                                       \
5290
                                                                             \
5291
    if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
5292
         extractFloat ## s ## Frac( a ) ) ||                                 \
5293
        ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
5294
          extractFloat ## s ## Frac( b ) )) {                                \
5295
        if (!is_quiet ||                                                     \
5296
            float ## s ## _is_signaling_nan( a ) ||                          \
5297
            float ## s ## _is_signaling_nan( b ) ) {                         \
5298
            float_raise( float_flag_invalid STATUS_VAR);                     \
5299
        }                                                                    \
5300
        return float_relation_unordered;                                     \
5301
    }                                                                        \
5302
    aSign = extractFloat ## s ## Sign( a );                                  \
5303
    bSign = extractFloat ## s ## Sign( b );                                  \
5304
    if ( aSign != bSign ) {                                                  \
5305
        if ( (bits ## s) ( ( a | b )<<1 ) == 0 ) {                           \
5306
            /* zero case */                                                  \
5307
            return float_relation_equal;                                     \
5308
        } else {                                                             \
5309
            return 1 - (2 * aSign);                                          \
5310
        }                                                                    \
5311
    } else {                                                                 \
5312
        if (a == b) {                                                        \
5313
            return float_relation_equal;                                     \
5314
        } else {                                                             \
5315
            return 1 - 2 * (aSign ^ ( a < b ));                              \
5316
        }                                                                    \
5317
    }                                                                        \
5318
}                                                                            \
5319
                                                                             \
5320
char float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )       \
5321
{                                                                            \
5322
    return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
5323
}                                                                            \
5324
                                                                             \
5325
char float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
5326
{                                                                            \
5327
    return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
5328
}
5329

    
5330
COMPARE(32, 0xff)
5331
COMPARE(64, 0x7ff)