Statistics
| Branch: | Revision:

root / fpu / softfloat.c @ 83f64091

History | View | Annotate | Download (188.2 kB)

1

    
2
/*============================================================================
3

4
This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5
Package, Release 2b.
6

7
Written by John R. Hauser.  This work was made possible in part by the
8
International Computer Science Institute, located at Suite 600, 1947 Center
9
Street, Berkeley, California 94704.  Funding was partially provided by the
10
National Science Foundation under grant MIP-9311980.  The original version
11
of this code was written as part of a project to build a fixed-point vector
12
processor in collaboration with the University of California at Berkeley,
13
overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
14
is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15
arithmetic/SoftFloat.html'.
16

17
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
18
been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19
RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20
AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21
COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22
EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23
INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24
OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
25

26
Derivative works are acceptable, even for commercial purposes, so long as
27
(1) the source code for the derivative work includes prominent notice that
28
the work is derivative, and (2) the source code includes prominent notice with
29
these four paragraphs for those parts of this code that are retained.
30

31
=============================================================================*/
32

    
33
#include "softfloat.h"
34

    
35
/*----------------------------------------------------------------------------
36
| Primitive arithmetic functions, including multi-word arithmetic, and
37
| division and square root approximations.  (Can be specialized to target if
38
| desired.)
39
*----------------------------------------------------------------------------*/
40
#include "softfloat-macros.h"
41

    
42
/*----------------------------------------------------------------------------
43
| Functions and definitions to determine:  (1) whether tininess for underflow
44
| is detected before or after rounding by default, (2) what (if anything)
45
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
46
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
47
| are propagated from function inputs to output.  These details are target-
48
| specific.
49
*----------------------------------------------------------------------------*/
50
#include "softfloat-specialize.h"
51

    
52
void set_float_rounding_mode(int val STATUS_PARAM)
53
{
54
    STATUS(float_rounding_mode) = val;
55
}
56

    
57
void set_float_exception_flags(int val STATUS_PARAM)
58
{
59
    STATUS(float_exception_flags) = val;
60
}
61

    
62
#ifdef FLOATX80
63
void set_floatx80_rounding_precision(int val STATUS_PARAM)
64
{
65
    STATUS(floatx80_rounding_precision) = val;
66
}
67
#endif
68

    
69
/*----------------------------------------------------------------------------
70
| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
71
| and 7, and returns the properly rounded 32-bit integer corresponding to the
72
| input.  If `zSign' is 1, the input is negated before being converted to an
73
| integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
74
| is simply rounded to an integer, with the inexact exception raised if the
75
| input cannot be represented exactly as an integer.  However, if the fixed-
76
| point input is too large, the invalid exception is raised and the largest
77
| positive or negative integer is returned.
78
*----------------------------------------------------------------------------*/
79

    
80
static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
81
{
82
    int8 roundingMode;
83
    flag roundNearestEven;
84
    int8 roundIncrement, roundBits;
85
    int32 z;
86

    
87
    roundingMode = STATUS(float_rounding_mode);
88
    roundNearestEven = ( roundingMode == float_round_nearest_even );
89
    roundIncrement = 0x40;
90
    if ( ! roundNearestEven ) {
91
        if ( roundingMode == float_round_to_zero ) {
92
            roundIncrement = 0;
93
        }
94
        else {
95
            roundIncrement = 0x7F;
96
            if ( zSign ) {
97
                if ( roundingMode == float_round_up ) roundIncrement = 0;
98
            }
99
            else {
100
                if ( roundingMode == float_round_down ) roundIncrement = 0;
101
            }
102
        }
103
    }
104
    roundBits = absZ & 0x7F;
105
    absZ = ( absZ + roundIncrement )>>7;
106
    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
107
    z = absZ;
108
    if ( zSign ) z = - z;
109
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
110
        float_raise( float_flag_invalid STATUS_VAR);
111
        return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
112
    }
113
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
114
    return z;
115

    
116
}
117

    
118
/*----------------------------------------------------------------------------
119
| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
120
| `absZ1', with binary point between bits 63 and 64 (between the input words),
121
| and returns the properly rounded 64-bit integer corresponding to the input.
122
| If `zSign' is 1, the input is negated before being converted to an integer.
123
| Ordinarily, the fixed-point input is simply rounded to an integer, with
124
| the inexact exception raised if the input cannot be represented exactly as
125
| an integer.  However, if the fixed-point input is too large, the invalid
126
| exception is raised and the largest positive or negative integer is
127
| returned.
128
*----------------------------------------------------------------------------*/
129

    
130
static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
131
{
132
    int8 roundingMode;
133
    flag roundNearestEven, increment;
134
    int64 z;
135

    
136
    roundingMode = STATUS(float_rounding_mode);
137
    roundNearestEven = ( roundingMode == float_round_nearest_even );
138
    increment = ( (sbits64) absZ1 < 0 );
139
    if ( ! roundNearestEven ) {
140
        if ( roundingMode == float_round_to_zero ) {
141
            increment = 0;
142
        }
143
        else {
144
            if ( zSign ) {
145
                increment = ( roundingMode == float_round_down ) && absZ1;
146
            }
147
            else {
148
                increment = ( roundingMode == float_round_up ) && absZ1;
149
            }
150
        }
151
    }
152
    if ( increment ) {
153
        ++absZ0;
154
        if ( absZ0 == 0 ) goto overflow;
155
        absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
156
    }
157
    z = absZ0;
158
    if ( zSign ) z = - z;
159
    if ( z && ( ( z < 0 ) ^ zSign ) ) {
160
 overflow:
161
        float_raise( float_flag_invalid STATUS_VAR);
162
        return
163
              zSign ? (sbits64) LIT64( 0x8000000000000000 )
164
            : LIT64( 0x7FFFFFFFFFFFFFFF );
165
    }
166
    if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
167
    return z;
168

    
169
}
170

    
171
/*----------------------------------------------------------------------------
172
| Returns the fraction bits of the single-precision floating-point value `a'.
173
*----------------------------------------------------------------------------*/
174

    
175
INLINE bits32 extractFloat32Frac( float32 a )
176
{
177

    
178
    return a & 0x007FFFFF;
179

    
180
}
181

    
182
/*----------------------------------------------------------------------------
183
| Returns the exponent bits of the single-precision floating-point value `a'.
184
*----------------------------------------------------------------------------*/
185

    
186
INLINE int16 extractFloat32Exp( float32 a )
187
{
188

    
189
    return ( a>>23 ) & 0xFF;
190

    
191
}
192

    
193
/*----------------------------------------------------------------------------
194
| Returns the sign bit of the single-precision floating-point value `a'.
195
*----------------------------------------------------------------------------*/
196

    
197
INLINE flag extractFloat32Sign( float32 a )
198
{
199

    
200
    return a>>31;
201

    
202
}
203

    
204
/*----------------------------------------------------------------------------
205
| Normalizes the subnormal single-precision floating-point value represented
206
| by the denormalized significand `aSig'.  The normalized exponent and
207
| significand are stored at the locations pointed to by `zExpPtr' and
208
| `zSigPtr', respectively.
209
*----------------------------------------------------------------------------*/
210

    
211
static void
212
 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
213
{
214
    int8 shiftCount;
215

    
216
    shiftCount = countLeadingZeros32( aSig ) - 8;
217
    *zSigPtr = aSig<<shiftCount;
218
    *zExpPtr = 1 - shiftCount;
219

    
220
}
221

    
222
/*----------------------------------------------------------------------------
223
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
224
| single-precision floating-point value, returning the result.  After being
225
| shifted into the proper positions, the three fields are simply added
226
| together to form the result.  This means that any integer portion of `zSig'
227
| will be added into the exponent.  Since a properly normalized significand
228
| will have an integer portion equal to 1, the `zExp' input should be 1 less
229
| than the desired result exponent whenever `zSig' is a complete, normalized
230
| significand.
231
*----------------------------------------------------------------------------*/
232

    
233
INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
234
{
235

    
236
    return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
237

    
238
}
239

    
240
/*----------------------------------------------------------------------------
241
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
242
| and significand `zSig', and returns the proper single-precision floating-
243
| point value corresponding to the abstract input.  Ordinarily, the abstract
244
| value is simply rounded and packed into the single-precision format, with
245
| the inexact exception raised if the abstract input cannot be represented
246
| exactly.  However, if the abstract value is too large, the overflow and
247
| inexact exceptions are raised and an infinity or maximal finite value is
248
| returned.  If the abstract value is too small, the input value is rounded to
249
| a subnormal number, and the underflow and inexact exceptions are raised if
250
| the abstract input cannot be represented exactly as a subnormal single-
251
| precision floating-point number.
252
|     The input significand `zSig' has its binary point between bits 30
253
| and 29, which is 7 bits to the left of the usual location.  This shifted
254
| significand must be normalized or smaller.  If `zSig' is not normalized,
255
| `zExp' must be 0; in that case, the result returned is a subnormal number,
256
| and it must not require rounding.  In the usual case that `zSig' is
257
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
258
| The handling of underflow and overflow follows the IEC/IEEE Standard for
259
| Binary Floating-Point Arithmetic.
260
*----------------------------------------------------------------------------*/
261

    
262
static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
263
{
264
    int8 roundingMode;
265
    flag roundNearestEven;
266
    int8 roundIncrement, roundBits;
267
    flag isTiny;
268

    
269
    roundingMode = STATUS(float_rounding_mode);
270
    roundNearestEven = ( roundingMode == float_round_nearest_even );
271
    roundIncrement = 0x40;
272
    if ( ! roundNearestEven ) {
273
        if ( roundingMode == float_round_to_zero ) {
274
            roundIncrement = 0;
275
        }
276
        else {
277
            roundIncrement = 0x7F;
278
            if ( zSign ) {
279
                if ( roundingMode == float_round_up ) roundIncrement = 0;
280
            }
281
            else {
282
                if ( roundingMode == float_round_down ) roundIncrement = 0;
283
            }
284
        }
285
    }
286
    roundBits = zSig & 0x7F;
287
    if ( 0xFD <= (bits16) zExp ) {
288
        if (    ( 0xFD < zExp )
289
             || (    ( zExp == 0xFD )
290
                  && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
291
           ) {
292
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
293
            return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
294
        }
295
        if ( zExp < 0 ) {
296
            isTiny =
297
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
298
                || ( zExp < -1 )
299
                || ( zSig + roundIncrement < 0x80000000 );
300
            shift32RightJamming( zSig, - zExp, &zSig );
301
            zExp = 0;
302
            roundBits = zSig & 0x7F;
303
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
304
        }
305
    }
306
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
307
    zSig = ( zSig + roundIncrement )>>7;
308
    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
309
    if ( zSig == 0 ) zExp = 0;
310
    return packFloat32( zSign, zExp, zSig );
311

    
312
}
313

    
314
/*----------------------------------------------------------------------------
315
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
316
| and significand `zSig', and returns the proper single-precision floating-
317
| point value corresponding to the abstract input.  This routine is just like
318
| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
319
| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
320
| floating-point exponent.
321
*----------------------------------------------------------------------------*/
322

    
323
static float32
324
 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
325
{
326
    int8 shiftCount;
327

    
328
    shiftCount = countLeadingZeros32( zSig ) - 1;
329
    return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
330

    
331
}
332

    
333
/*----------------------------------------------------------------------------
334
| Returns the fraction bits of the double-precision floating-point value `a'.
335
*----------------------------------------------------------------------------*/
336

    
337
INLINE bits64 extractFloat64Frac( float64 a )
338
{
339

    
340
    return a & LIT64( 0x000FFFFFFFFFFFFF );
341

    
342
}
343

    
344
/*----------------------------------------------------------------------------
345
| Returns the exponent bits of the double-precision floating-point value `a'.
346
*----------------------------------------------------------------------------*/
347

    
348
INLINE int16 extractFloat64Exp( float64 a )
349
{
350

    
351
    return ( a>>52 ) & 0x7FF;
352

    
353
}
354

    
355
/*----------------------------------------------------------------------------
356
| Returns the sign bit of the double-precision floating-point value `a'.
357
*----------------------------------------------------------------------------*/
358

    
359
INLINE flag extractFloat64Sign( float64 a )
360
{
361

    
362
    return a>>63;
363

    
364
}
365

    
366
/*----------------------------------------------------------------------------
367
| Normalizes the subnormal double-precision floating-point value represented
368
| by the denormalized significand `aSig'.  The normalized exponent and
369
| significand are stored at the locations pointed to by `zExpPtr' and
370
| `zSigPtr', respectively.
371
*----------------------------------------------------------------------------*/
372

    
373
static void
374
 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
375
{
376
    int8 shiftCount;
377

    
378
    shiftCount = countLeadingZeros64( aSig ) - 11;
379
    *zSigPtr = aSig<<shiftCount;
380
    *zExpPtr = 1 - shiftCount;
381

    
382
}
383

    
384
/*----------------------------------------------------------------------------
385
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
386
| double-precision floating-point value, returning the result.  After being
387
| shifted into the proper positions, the three fields are simply added
388
| together to form the result.  This means that any integer portion of `zSig'
389
| will be added into the exponent.  Since a properly normalized significand
390
| will have an integer portion equal to 1, the `zExp' input should be 1 less
391
| than the desired result exponent whenever `zSig' is a complete, normalized
392
| significand.
393
*----------------------------------------------------------------------------*/
394

    
395
INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
396
{
397

    
398
    return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
399

    
400
}
401

    
402
/*----------------------------------------------------------------------------
403
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
404
| and significand `zSig', and returns the proper double-precision floating-
405
| point value corresponding to the abstract input.  Ordinarily, the abstract
406
| value is simply rounded and packed into the double-precision format, with
407
| the inexact exception raised if the abstract input cannot be represented
408
| exactly.  However, if the abstract value is too large, the overflow and
409
| inexact exceptions are raised and an infinity or maximal finite value is
410
| returned.  If the abstract value is too small, the input value is rounded
411
| to a subnormal number, and the underflow and inexact exceptions are raised
412
| if the abstract input cannot be represented exactly as a subnormal double-
413
| precision floating-point number.
414
|     The input significand `zSig' has its binary point between bits 62
415
| and 61, which is 10 bits to the left of the usual location.  This shifted
416
| significand must be normalized or smaller.  If `zSig' is not normalized,
417
| `zExp' must be 0; in that case, the result returned is a subnormal number,
418
| and it must not require rounding.  In the usual case that `zSig' is
419
| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
420
| The handling of underflow and overflow follows the IEC/IEEE Standard for
421
| Binary Floating-Point Arithmetic.
422
*----------------------------------------------------------------------------*/
423

    
424
static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
425
{
426
    int8 roundingMode;
427
    flag roundNearestEven;
428
    int16 roundIncrement, roundBits;
429
    flag isTiny;
430

    
431
    roundingMode = STATUS(float_rounding_mode);
432
    roundNearestEven = ( roundingMode == float_round_nearest_even );
433
    roundIncrement = 0x200;
434
    if ( ! roundNearestEven ) {
435
        if ( roundingMode == float_round_to_zero ) {
436
            roundIncrement = 0;
437
        }
438
        else {
439
            roundIncrement = 0x3FF;
440
            if ( zSign ) {
441
                if ( roundingMode == float_round_up ) roundIncrement = 0;
442
            }
443
            else {
444
                if ( roundingMode == float_round_down ) roundIncrement = 0;
445
            }
446
        }
447
    }
448
    roundBits = zSig & 0x3FF;
449
    if ( 0x7FD <= (bits16) zExp ) {
450
        if (    ( 0x7FD < zExp )
451
             || (    ( zExp == 0x7FD )
452
                  && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
453
           ) {
454
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
455
            return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
456
        }
457
        if ( zExp < 0 ) {
458
            isTiny =
459
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
460
                || ( zExp < -1 )
461
                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
462
            shift64RightJamming( zSig, - zExp, &zSig );
463
            zExp = 0;
464
            roundBits = zSig & 0x3FF;
465
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
466
        }
467
    }
468
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
469
    zSig = ( zSig + roundIncrement )>>10;
470
    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
471
    if ( zSig == 0 ) zExp = 0;
472
    return packFloat64( zSign, zExp, zSig );
473

    
474
}
475

    
476
/*----------------------------------------------------------------------------
477
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
478
| and significand `zSig', and returns the proper double-precision floating-
479
| point value corresponding to the abstract input.  This routine is just like
480
| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
481
| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
482
| floating-point exponent.
483
*----------------------------------------------------------------------------*/
484

    
485
static float64
486
 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
487
{
488
    int8 shiftCount;
489

    
490
    shiftCount = countLeadingZeros64( zSig ) - 1;
491
    return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
492

    
493
}
494

    
495
#ifdef FLOATX80
496

    
497
/*----------------------------------------------------------------------------
498
| Returns the fraction bits of the extended double-precision floating-point
499
| value `a'.
500
*----------------------------------------------------------------------------*/
501

    
502
INLINE bits64 extractFloatx80Frac( floatx80 a )
503
{
504

    
505
    return a.low;
506

    
507
}
508

    
509
/*----------------------------------------------------------------------------
510
| Returns the exponent bits of the extended double-precision floating-point
511
| value `a'.
512
*----------------------------------------------------------------------------*/
513

    
514
INLINE int32 extractFloatx80Exp( floatx80 a )
515
{
516

    
517
    return a.high & 0x7FFF;
518

    
519
}
520

    
521
/*----------------------------------------------------------------------------
522
| Returns the sign bit of the extended double-precision floating-point value
523
| `a'.
524
*----------------------------------------------------------------------------*/
525

    
526
INLINE flag extractFloatx80Sign( floatx80 a )
527
{
528

    
529
    return a.high>>15;
530

    
531
}
532

    
533
/*----------------------------------------------------------------------------
534
| Normalizes the subnormal extended double-precision floating-point value
535
| represented by the denormalized significand `aSig'.  The normalized exponent
536
| and significand are stored at the locations pointed to by `zExpPtr' and
537
| `zSigPtr', respectively.
538
*----------------------------------------------------------------------------*/
539

    
540
static void
541
 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
542
{
543
    int8 shiftCount;
544

    
545
    shiftCount = countLeadingZeros64( aSig );
546
    *zSigPtr = aSig<<shiftCount;
547
    *zExpPtr = 1 - shiftCount;
548

    
549
}
550

    
551
/*----------------------------------------------------------------------------
552
| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
553
| extended double-precision floating-point value, returning the result.
554
*----------------------------------------------------------------------------*/
555

    
556
INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
557
{
558
    floatx80 z;
559

    
560
    z.low = zSig;
561
    z.high = ( ( (bits16) zSign )<<15 ) + zExp;
562
    return z;
563

    
564
}
565

    
566
/*----------------------------------------------------------------------------
567
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
568
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
569
| and returns the proper extended double-precision floating-point value
570
| corresponding to the abstract input.  Ordinarily, the abstract value is
571
| rounded and packed into the extended double-precision format, with the
572
| inexact exception raised if the abstract input cannot be represented
573
| exactly.  However, if the abstract value is too large, the overflow and
574
| inexact exceptions are raised and an infinity or maximal finite value is
575
| returned.  If the abstract value is too small, the input value is rounded to
576
| a subnormal number, and the underflow and inexact exceptions are raised if
577
| the abstract input cannot be represented exactly as a subnormal extended
578
| double-precision floating-point number.
579
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
580
| number of bits as single or double precision, respectively.  Otherwise, the
581
| result is rounded to the full precision of the extended double-precision
582
| format.
583
|     The input significand must be normalized or smaller.  If the input
584
| significand is not normalized, `zExp' must be 0; in that case, the result
585
| returned is a subnormal number, and it must not require rounding.  The
586
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
587
| Floating-Point Arithmetic.
588
*----------------------------------------------------------------------------*/
589

    
590
static floatx80
591
 roundAndPackFloatx80(
592
     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
593
 STATUS_PARAM)
594
{
595
    int8 roundingMode;
596
    flag roundNearestEven, increment, isTiny;
597
    int64 roundIncrement, roundMask, roundBits;
598

    
599
    roundingMode = STATUS(float_rounding_mode);
600
    roundNearestEven = ( roundingMode == float_round_nearest_even );
601
    if ( roundingPrecision == 80 ) goto precision80;
602
    if ( roundingPrecision == 64 ) {
603
        roundIncrement = LIT64( 0x0000000000000400 );
604
        roundMask = LIT64( 0x00000000000007FF );
605
    }
606
    else if ( roundingPrecision == 32 ) {
607
        roundIncrement = LIT64( 0x0000008000000000 );
608
        roundMask = LIT64( 0x000000FFFFFFFFFF );
609
    }
610
    else {
611
        goto precision80;
612
    }
613
    zSig0 |= ( zSig1 != 0 );
614
    if ( ! roundNearestEven ) {
615
        if ( roundingMode == float_round_to_zero ) {
616
            roundIncrement = 0;
617
        }
618
        else {
619
            roundIncrement = roundMask;
620
            if ( zSign ) {
621
                if ( roundingMode == float_round_up ) roundIncrement = 0;
622
            }
623
            else {
624
                if ( roundingMode == float_round_down ) roundIncrement = 0;
625
            }
626
        }
627
    }
628
    roundBits = zSig0 & roundMask;
629
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
630
        if (    ( 0x7FFE < zExp )
631
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
632
           ) {
633
            goto overflow;
634
        }
635
        if ( zExp <= 0 ) {
636
            isTiny =
637
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
638
                || ( zExp < 0 )
639
                || ( zSig0 <= zSig0 + roundIncrement );
640
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
641
            zExp = 0;
642
            roundBits = zSig0 & roundMask;
643
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
644
            if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
645
            zSig0 += roundIncrement;
646
            if ( (sbits64) zSig0 < 0 ) zExp = 1;
647
            roundIncrement = roundMask + 1;
648
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
649
                roundMask |= roundIncrement;
650
            }
651
            zSig0 &= ~ roundMask;
652
            return packFloatx80( zSign, zExp, zSig0 );
653
        }
654
    }
655
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
656
    zSig0 += roundIncrement;
657
    if ( zSig0 < roundIncrement ) {
658
        ++zExp;
659
        zSig0 = LIT64( 0x8000000000000000 );
660
    }
661
    roundIncrement = roundMask + 1;
662
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
663
        roundMask |= roundIncrement;
664
    }
665
    zSig0 &= ~ roundMask;
666
    if ( zSig0 == 0 ) zExp = 0;
667
    return packFloatx80( zSign, zExp, zSig0 );
668
 precision80:
669
    increment = ( (sbits64) zSig1 < 0 );
670
    if ( ! roundNearestEven ) {
671
        if ( roundingMode == float_round_to_zero ) {
672
            increment = 0;
673
        }
674
        else {
675
            if ( zSign ) {
676
                increment = ( roundingMode == float_round_down ) && zSig1;
677
            }
678
            else {
679
                increment = ( roundingMode == float_round_up ) && zSig1;
680
            }
681
        }
682
    }
683
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
684
        if (    ( 0x7FFE < zExp )
685
             || (    ( zExp == 0x7FFE )
686
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
687
                  && increment
688
                )
689
           ) {
690
            roundMask = 0;
691
 overflow:
692
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
693
            if (    ( roundingMode == float_round_to_zero )
694
                 || ( zSign && ( roundingMode == float_round_up ) )
695
                 || ( ! zSign && ( roundingMode == float_round_down ) )
696
               ) {
697
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
698
            }
699
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
700
        }
701
        if ( zExp <= 0 ) {
702
            isTiny =
703
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
704
                || ( zExp < 0 )
705
                || ! increment
706
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
707
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
708
            zExp = 0;
709
            if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
710
            if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
711
            if ( roundNearestEven ) {
712
                increment = ( (sbits64) zSig1 < 0 );
713
            }
714
            else {
715
                if ( zSign ) {
716
                    increment = ( roundingMode == float_round_down ) && zSig1;
717
                }
718
                else {
719
                    increment = ( roundingMode == float_round_up ) && zSig1;
720
                }
721
            }
722
            if ( increment ) {
723
                ++zSig0;
724
                zSig0 &=
725
                    ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
726
                if ( (sbits64) zSig0 < 0 ) zExp = 1;
727
            }
728
            return packFloatx80( zSign, zExp, zSig0 );
729
        }
730
    }
731
    if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
732
    if ( increment ) {
733
        ++zSig0;
734
        if ( zSig0 == 0 ) {
735
            ++zExp;
736
            zSig0 = LIT64( 0x8000000000000000 );
737
        }
738
        else {
739
            zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
740
        }
741
    }
742
    else {
743
        if ( zSig0 == 0 ) zExp = 0;
744
    }
745
    return packFloatx80( zSign, zExp, zSig0 );
746

    
747
}
748

    
749
/*----------------------------------------------------------------------------
750
| Takes an abstract floating-point value having sign `zSign', exponent
751
| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
752
| and returns the proper extended double-precision floating-point value
753
| corresponding to the abstract input.  This routine is just like
754
| `roundAndPackFloatx80' except that the input significand does not have to be
755
| normalized.
756
*----------------------------------------------------------------------------*/
757

    
758
static floatx80
759
 normalizeRoundAndPackFloatx80(
760
     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
761
 STATUS_PARAM)
762
{
763
    int8 shiftCount;
764

    
765
    if ( zSig0 == 0 ) {
766
        zSig0 = zSig1;
767
        zSig1 = 0;
768
        zExp -= 64;
769
    }
770
    shiftCount = countLeadingZeros64( zSig0 );
771
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
772
    zExp -= shiftCount;
773
    return
774
        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
775

    
776
}
777

    
778
#endif
779

    
780
#ifdef FLOAT128
781

    
782
/*----------------------------------------------------------------------------
783
| Returns the least-significant 64 fraction bits of the quadruple-precision
784
| floating-point value `a'.
785
*----------------------------------------------------------------------------*/
786

    
787
INLINE bits64 extractFloat128Frac1( float128 a )
788
{
789

    
790
    return a.low;
791

    
792
}
793

    
794
/*----------------------------------------------------------------------------
795
| Returns the most-significant 48 fraction bits of the quadruple-precision
796
| floating-point value `a'.
797
*----------------------------------------------------------------------------*/
798

    
799
INLINE bits64 extractFloat128Frac0( float128 a )
800
{
801

    
802
    return a.high & LIT64( 0x0000FFFFFFFFFFFF );
803

    
804
}
805

    
806
/*----------------------------------------------------------------------------
807
| Returns the exponent bits of the quadruple-precision floating-point value
808
| `a'.
809
*----------------------------------------------------------------------------*/
810

    
811
INLINE int32 extractFloat128Exp( float128 a )
812
{
813

    
814
    return ( a.high>>48 ) & 0x7FFF;
815

    
816
}
817

    
818
/*----------------------------------------------------------------------------
819
| Returns the sign bit of the quadruple-precision floating-point value `a'.
820
*----------------------------------------------------------------------------*/
821

    
822
INLINE flag extractFloat128Sign( float128 a )
823
{
824

    
825
    return a.high>>63;
826

    
827
}
828

    
829
/*----------------------------------------------------------------------------
830
| Normalizes the subnormal quadruple-precision floating-point value
831
| represented by the denormalized significand formed by the concatenation of
832
| `aSig0' and `aSig1'.  The normalized exponent is stored at the location
833
| pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
834
| significand are stored at the location pointed to by `zSig0Ptr', and the
835
| least significant 64 bits of the normalized significand are stored at the
836
| location pointed to by `zSig1Ptr'.
837
*----------------------------------------------------------------------------*/
838

    
839
static void
840
 normalizeFloat128Subnormal(
841
     bits64 aSig0,
842
     bits64 aSig1,
843
     int32 *zExpPtr,
844
     bits64 *zSig0Ptr,
845
     bits64 *zSig1Ptr
846
 )
847
{
848
    int8 shiftCount;
849

    
850
    if ( aSig0 == 0 ) {
851
        shiftCount = countLeadingZeros64( aSig1 ) - 15;
852
        if ( shiftCount < 0 ) {
853
            *zSig0Ptr = aSig1>>( - shiftCount );
854
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
855
        }
856
        else {
857
            *zSig0Ptr = aSig1<<shiftCount;
858
            *zSig1Ptr = 0;
859
        }
860
        *zExpPtr = - shiftCount - 63;
861
    }
862
    else {
863
        shiftCount = countLeadingZeros64( aSig0 ) - 15;
864
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
865
        *zExpPtr = 1 - shiftCount;
866
    }
867

    
868
}
869

    
870
/*----------------------------------------------------------------------------
871
| Packs the sign `zSign', the exponent `zExp', and the significand formed
872
| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
873
| floating-point value, returning the result.  After being shifted into the
874
| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
875
| added together to form the most significant 32 bits of the result.  This
876
| means that any integer portion of `zSig0' will be added into the exponent.
877
| Since a properly normalized significand will have an integer portion equal
878
| to 1, the `zExp' input should be 1 less than the desired result exponent
879
| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
880
| significand.
881
*----------------------------------------------------------------------------*/
882

    
883
INLINE float128
884
 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
885
{
886
    float128 z;
887

    
888
    z.low = zSig1;
889
    z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
890
    return z;
891

    
892
}
893

    
894
/*----------------------------------------------------------------------------
895
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
896
| and extended significand formed by the concatenation of `zSig0', `zSig1',
897
| and `zSig2', and returns the proper quadruple-precision floating-point value
898
| corresponding to the abstract input.  Ordinarily, the abstract value is
899
| simply rounded and packed into the quadruple-precision format, with the
900
| inexact exception raised if the abstract input cannot be represented
901
| exactly.  However, if the abstract value is too large, the overflow and
902
| inexact exceptions are raised and an infinity or maximal finite value is
903
| returned.  If the abstract value is too small, the input value is rounded to
904
| a subnormal number, and the underflow and inexact exceptions are raised if
905
| the abstract input cannot be represented exactly as a subnormal quadruple-
906
| precision floating-point number.
907
|     The input significand must be normalized or smaller.  If the input
908
| significand is not normalized, `zExp' must be 0; in that case, the result
909
| returned is a subnormal number, and it must not require rounding.  In the
910
| usual case that the input significand is normalized, `zExp' must be 1 less
911
| than the ``true'' floating-point exponent.  The handling of underflow and
912
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
913
*----------------------------------------------------------------------------*/
914

    
915
static float128
916
 roundAndPackFloat128(
917
     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
918
{
919
    int8 roundingMode;
920
    flag roundNearestEven, increment, isTiny;
921

    
922
    roundingMode = STATUS(float_rounding_mode);
923
    roundNearestEven = ( roundingMode == float_round_nearest_even );
924
    increment = ( (sbits64) zSig2 < 0 );
925
    if ( ! roundNearestEven ) {
926
        if ( roundingMode == float_round_to_zero ) {
927
            increment = 0;
928
        }
929
        else {
930
            if ( zSign ) {
931
                increment = ( roundingMode == float_round_down ) && zSig2;
932
            }
933
            else {
934
                increment = ( roundingMode == float_round_up ) && zSig2;
935
            }
936
        }
937
    }
938
    if ( 0x7FFD <= (bits32) zExp ) {
939
        if (    ( 0x7FFD < zExp )
940
             || (    ( zExp == 0x7FFD )
941
                  && eq128(
942
                         LIT64( 0x0001FFFFFFFFFFFF ),
943
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
944
                         zSig0,
945
                         zSig1
946
                     )
947
                  && increment
948
                )
949
           ) {
950
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
951
            if (    ( roundingMode == float_round_to_zero )
952
                 || ( zSign && ( roundingMode == float_round_up ) )
953
                 || ( ! zSign && ( roundingMode == float_round_down ) )
954
               ) {
955
                return
956
                    packFloat128(
957
                        zSign,
958
                        0x7FFE,
959
                        LIT64( 0x0000FFFFFFFFFFFF ),
960
                        LIT64( 0xFFFFFFFFFFFFFFFF )
961
                    );
962
            }
963
            return packFloat128( zSign, 0x7FFF, 0, 0 );
964
        }
965
        if ( zExp < 0 ) {
966
            isTiny =
967
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
968
                || ( zExp < -1 )
969
                || ! increment
970
                || lt128(
971
                       zSig0,
972
                       zSig1,
973
                       LIT64( 0x0001FFFFFFFFFFFF ),
974
                       LIT64( 0xFFFFFFFFFFFFFFFF )
975
                   );
976
            shift128ExtraRightJamming(
977
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
978
            zExp = 0;
979
            if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
980
            if ( roundNearestEven ) {
981
                increment = ( (sbits64) zSig2 < 0 );
982
            }
983
            else {
984
                if ( zSign ) {
985
                    increment = ( roundingMode == float_round_down ) && zSig2;
986
                }
987
                else {
988
                    increment = ( roundingMode == float_round_up ) && zSig2;
989
                }
990
            }
991
        }
992
    }
993
    if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
994
    if ( increment ) {
995
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
996
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
997
    }
998
    else {
999
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1000
    }
1001
    return packFloat128( zSign, zExp, zSig0, zSig1 );
1002

    
1003
}
1004

    
1005
/*----------------------------------------------------------------------------
1006
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1007
| and significand formed by the concatenation of `zSig0' and `zSig1', and
1008
| returns the proper quadruple-precision floating-point value corresponding
1009
| to the abstract input.  This routine is just like `roundAndPackFloat128'
1010
| except that the input significand has fewer bits and does not have to be
1011
| normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1012
| point exponent.
1013
*----------------------------------------------------------------------------*/
1014

    
1015
static float128
1016
 normalizeRoundAndPackFloat128(
1017
     flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1018
{
1019
    int8 shiftCount;
1020
    bits64 zSig2;
1021

    
1022
    if ( zSig0 == 0 ) {
1023
        zSig0 = zSig1;
1024
        zSig1 = 0;
1025
        zExp -= 64;
1026
    }
1027
    shiftCount = countLeadingZeros64( zSig0 ) - 15;
1028
    if ( 0 <= shiftCount ) {
1029
        zSig2 = 0;
1030
        shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1031
    }
1032
    else {
1033
        shift128ExtraRightJamming(
1034
            zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1035
    }
1036
    zExp -= shiftCount;
1037
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1038

    
1039
}
1040

    
1041
#endif
1042

    
1043
/*----------------------------------------------------------------------------
1044
| Returns the result of converting the 32-bit two's complement integer `a'
1045
| to the single-precision floating-point format.  The conversion is performed
1046
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1047
*----------------------------------------------------------------------------*/
1048

    
1049
float32 int32_to_float32( int32 a STATUS_PARAM )
1050
{
1051
    flag zSign;
1052

    
1053
    if ( a == 0 ) return 0;
1054
    if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1055
    zSign = ( a < 0 );
1056
    return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1057

    
1058
}
1059

    
1060
/*----------------------------------------------------------------------------
1061
| Returns the result of converting the 32-bit two's complement integer `a'
1062
| to the double-precision floating-point format.  The conversion is performed
1063
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1064
*----------------------------------------------------------------------------*/
1065

    
1066
float64 int32_to_float64( int32 a STATUS_PARAM )
1067
{
1068
    flag zSign;
1069
    uint32 absA;
1070
    int8 shiftCount;
1071
    bits64 zSig;
1072

    
1073
    if ( a == 0 ) return 0;
1074
    zSign = ( a < 0 );
1075
    absA = zSign ? - a : a;
1076
    shiftCount = countLeadingZeros32( absA ) + 21;
1077
    zSig = absA;
1078
    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1079

    
1080
}
1081

    
1082
#ifdef FLOATX80
1083

    
1084
/*----------------------------------------------------------------------------
1085
| Returns the result of converting the 32-bit two's complement integer `a'
1086
| to the extended double-precision floating-point format.  The conversion
1087
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1088
| Arithmetic.
1089
*----------------------------------------------------------------------------*/
1090

    
1091
floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1092
{
1093
    flag zSign;
1094
    uint32 absA;
1095
    int8 shiftCount;
1096
    bits64 zSig;
1097

    
1098
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1099
    zSign = ( a < 0 );
1100
    absA = zSign ? - a : a;
1101
    shiftCount = countLeadingZeros32( absA ) + 32;
1102
    zSig = absA;
1103
    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1104

    
1105
}
1106

    
1107
#endif
1108

    
1109
#ifdef FLOAT128
1110

    
1111
/*----------------------------------------------------------------------------
1112
| Returns the result of converting the 32-bit two's complement integer `a' to
1113
| the quadruple-precision floating-point format.  The conversion is performed
1114
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1115
*----------------------------------------------------------------------------*/
1116

    
1117
float128 int32_to_float128( int32 a STATUS_PARAM )
1118
{
1119
    flag zSign;
1120
    uint32 absA;
1121
    int8 shiftCount;
1122
    bits64 zSig0;
1123

    
1124
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1125
    zSign = ( a < 0 );
1126
    absA = zSign ? - a : a;
1127
    shiftCount = countLeadingZeros32( absA ) + 17;
1128
    zSig0 = absA;
1129
    return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1130

    
1131
}
1132

    
1133
#endif
1134

    
1135
/*----------------------------------------------------------------------------
1136
| Returns the result of converting the 64-bit two's complement integer `a'
1137
| to the single-precision floating-point format.  The conversion is performed
1138
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1139
*----------------------------------------------------------------------------*/
1140

    
1141
float32 int64_to_float32( int64 a STATUS_PARAM )
1142
{
1143
    flag zSign;
1144
    uint64 absA;
1145
    int8 shiftCount;
1146

    
1147
    if ( a == 0 ) return 0;
1148
    zSign = ( a < 0 );
1149
    absA = zSign ? - a : a;
1150
    shiftCount = countLeadingZeros64( absA ) - 40;
1151
    if ( 0 <= shiftCount ) {
1152
        return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1153
    }
1154
    else {
1155
        shiftCount += 7;
1156
        if ( shiftCount < 0 ) {
1157
            shift64RightJamming( absA, - shiftCount, &absA );
1158
        }
1159
        else {
1160
            absA <<= shiftCount;
1161
        }
1162
        return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1163
    }
1164

    
1165
}
1166

    
1167
/*----------------------------------------------------------------------------
1168
| Returns the result of converting the 64-bit two's complement integer `a'
1169
| to the double-precision floating-point format.  The conversion is performed
1170
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1171
*----------------------------------------------------------------------------*/
1172

    
1173
float64 int64_to_float64( int64 a STATUS_PARAM )
1174
{
1175
    flag zSign;
1176

    
1177
    if ( a == 0 ) return 0;
1178
    if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1179
        return packFloat64( 1, 0x43E, 0 );
1180
    }
1181
    zSign = ( a < 0 );
1182
    return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1183

    
1184
}
1185

    
1186
#ifdef FLOATX80
1187

    
1188
/*----------------------------------------------------------------------------
1189
| Returns the result of converting the 64-bit two's complement integer `a'
1190
| to the extended double-precision floating-point format.  The conversion
1191
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1192
| Arithmetic.
1193
*----------------------------------------------------------------------------*/
1194

    
1195
floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1196
{
1197
    flag zSign;
1198
    uint64 absA;
1199
    int8 shiftCount;
1200

    
1201
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1202
    zSign = ( a < 0 );
1203
    absA = zSign ? - a : a;
1204
    shiftCount = countLeadingZeros64( absA );
1205
    return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1206

    
1207
}
1208

    
1209
#endif
1210

    
1211
#ifdef FLOAT128
1212

    
1213
/*----------------------------------------------------------------------------
1214
| Returns the result of converting the 64-bit two's complement integer `a' to
1215
| the quadruple-precision floating-point format.  The conversion is performed
1216
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1217
*----------------------------------------------------------------------------*/
1218

    
1219
float128 int64_to_float128( int64 a STATUS_PARAM )
1220
{
1221
    flag zSign;
1222
    uint64 absA;
1223
    int8 shiftCount;
1224
    int32 zExp;
1225
    bits64 zSig0, zSig1;
1226

    
1227
    if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1228
    zSign = ( a < 0 );
1229
    absA = zSign ? - a : a;
1230
    shiftCount = countLeadingZeros64( absA ) + 49;
1231
    zExp = 0x406E - shiftCount;
1232
    if ( 64 <= shiftCount ) {
1233
        zSig1 = 0;
1234
        zSig0 = absA;
1235
        shiftCount -= 64;
1236
    }
1237
    else {
1238
        zSig1 = absA;
1239
        zSig0 = 0;
1240
    }
1241
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1242
    return packFloat128( zSign, zExp, zSig0, zSig1 );
1243

    
1244
}
1245

    
1246
#endif
1247

    
1248
/*----------------------------------------------------------------------------
1249
| Returns the result of converting the single-precision floating-point value
1250
| `a' to the 32-bit two's complement integer format.  The conversion is
1251
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1252
| Arithmetic---which means in particular that the conversion is rounded
1253
| according to the current rounding mode.  If `a' is a NaN, the largest
1254
| positive integer is returned.  Otherwise, if the conversion overflows, the
1255
| largest integer with the same sign as `a' is returned.
1256
*----------------------------------------------------------------------------*/
1257

    
1258
int32 float32_to_int32( float32 a STATUS_PARAM )
1259
{
1260
    flag aSign;
1261
    int16 aExp, shiftCount;
1262
    bits32 aSig;
1263
    bits64 aSig64;
1264

    
1265
    aSig = extractFloat32Frac( a );
1266
    aExp = extractFloat32Exp( a );
1267
    aSign = extractFloat32Sign( a );
1268
    if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1269
    if ( aExp ) aSig |= 0x00800000;
1270
    shiftCount = 0xAF - aExp;
1271
    aSig64 = aSig;
1272
    aSig64 <<= 32;
1273
    if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1274
    return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1275

    
1276
}
1277

    
1278
/*----------------------------------------------------------------------------
1279
| Returns the result of converting the single-precision floating-point value
1280
| `a' to the 32-bit two's complement integer format.  The conversion is
1281
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1282
| Arithmetic, except that the conversion is always rounded toward zero.
1283
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1284
| the conversion overflows, the largest integer with the same sign as `a' is
1285
| returned.
1286
*----------------------------------------------------------------------------*/
1287

    
1288
int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1289
{
1290
    flag aSign;
1291
    int16 aExp, shiftCount;
1292
    bits32 aSig;
1293
    int32 z;
1294

    
1295
    aSig = extractFloat32Frac( a );
1296
    aExp = extractFloat32Exp( a );
1297
    aSign = extractFloat32Sign( a );
1298
    shiftCount = aExp - 0x9E;
1299
    if ( 0 <= shiftCount ) {
1300
        if ( a != 0xCF000000 ) {
1301
            float_raise( float_flag_invalid STATUS_VAR);
1302
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1303
        }
1304
        return (sbits32) 0x80000000;
1305
    }
1306
    else if ( aExp <= 0x7E ) {
1307
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1308
        return 0;
1309
    }
1310
    aSig = ( aSig | 0x00800000 )<<8;
1311
    z = aSig>>( - shiftCount );
1312
    if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1313
        STATUS(float_exception_flags) |= float_flag_inexact;
1314
    }
1315
    if ( aSign ) z = - z;
1316
    return z;
1317

    
1318
}
1319

    
1320
/*----------------------------------------------------------------------------
1321
| Returns the result of converting the single-precision floating-point value
1322
| `a' to the 64-bit two's complement integer format.  The conversion is
1323
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1324
| Arithmetic---which means in particular that the conversion is rounded
1325
| according to the current rounding mode.  If `a' is a NaN, the largest
1326
| positive integer is returned.  Otherwise, if the conversion overflows, the
1327
| largest integer with the same sign as `a' is returned.
1328
*----------------------------------------------------------------------------*/
1329

    
1330
int64 float32_to_int64( float32 a STATUS_PARAM )
1331
{
1332
    flag aSign;
1333
    int16 aExp, shiftCount;
1334
    bits32 aSig;
1335
    bits64 aSig64, aSigExtra;
1336

    
1337
    aSig = extractFloat32Frac( a );
1338
    aExp = extractFloat32Exp( a );
1339
    aSign = extractFloat32Sign( a );
1340
    shiftCount = 0xBE - aExp;
1341
    if ( shiftCount < 0 ) {
1342
        float_raise( float_flag_invalid STATUS_VAR);
1343
        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1344
            return LIT64( 0x7FFFFFFFFFFFFFFF );
1345
        }
1346
        return (sbits64) LIT64( 0x8000000000000000 );
1347
    }
1348
    if ( aExp ) aSig |= 0x00800000;
1349
    aSig64 = aSig;
1350
    aSig64 <<= 40;
1351
    shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1352
    return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1353

    
1354
}
1355

    
1356
/*----------------------------------------------------------------------------
1357
| Returns the result of converting the single-precision floating-point value
1358
| `a' to the 64-bit two's complement integer format.  The conversion is
1359
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1360
| Arithmetic, except that the conversion is always rounded toward zero.  If
1361
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1362
| conversion overflows, the largest integer with the same sign as `a' is
1363
| returned.
1364
*----------------------------------------------------------------------------*/
1365

    
1366
int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1367
{
1368
    flag aSign;
1369
    int16 aExp, shiftCount;
1370
    bits32 aSig;
1371
    bits64 aSig64;
1372
    int64 z;
1373

    
1374
    aSig = extractFloat32Frac( a );
1375
    aExp = extractFloat32Exp( a );
1376
    aSign = extractFloat32Sign( a );
1377
    shiftCount = aExp - 0xBE;
1378
    if ( 0 <= shiftCount ) {
1379
        if ( a != 0xDF000000 ) {
1380
            float_raise( float_flag_invalid STATUS_VAR);
1381
            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1382
                return LIT64( 0x7FFFFFFFFFFFFFFF );
1383
            }
1384
        }
1385
        return (sbits64) LIT64( 0x8000000000000000 );
1386
    }
1387
    else if ( aExp <= 0x7E ) {
1388
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1389
        return 0;
1390
    }
1391
    aSig64 = aSig | 0x00800000;
1392
    aSig64 <<= 40;
1393
    z = aSig64>>( - shiftCount );
1394
    if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1395
        STATUS(float_exception_flags) |= float_flag_inexact;
1396
    }
1397
    if ( aSign ) z = - z;
1398
    return z;
1399

    
1400
}
1401

    
1402
/*----------------------------------------------------------------------------
1403
| Returns the result of converting the single-precision floating-point value
1404
| `a' to the double-precision floating-point format.  The conversion is
1405
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1406
| Arithmetic.
1407
*----------------------------------------------------------------------------*/
1408

    
1409
float64 float32_to_float64( float32 a STATUS_PARAM )
1410
{
1411
    flag aSign;
1412
    int16 aExp;
1413
    bits32 aSig;
1414

    
1415
    aSig = extractFloat32Frac( a );
1416
    aExp = extractFloat32Exp( a );
1417
    aSign = extractFloat32Sign( a );
1418
    if ( aExp == 0xFF ) {
1419
        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1420
        return packFloat64( aSign, 0x7FF, 0 );
1421
    }
1422
    if ( aExp == 0 ) {
1423
        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1424
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1425
        --aExp;
1426
    }
1427
    return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1428

    
1429
}
1430

    
1431
#ifdef FLOATX80
1432

    
1433
/*----------------------------------------------------------------------------
1434
| Returns the result of converting the single-precision floating-point value
1435
| `a' to the extended double-precision floating-point format.  The conversion
1436
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1437
| Arithmetic.
1438
*----------------------------------------------------------------------------*/
1439

    
1440
floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1441
{
1442
    flag aSign;
1443
    int16 aExp;
1444
    bits32 aSig;
1445

    
1446
    aSig = extractFloat32Frac( a );
1447
    aExp = extractFloat32Exp( a );
1448
    aSign = extractFloat32Sign( a );
1449
    if ( aExp == 0xFF ) {
1450
        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1451
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1452
    }
1453
    if ( aExp == 0 ) {
1454
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1455
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1456
    }
1457
    aSig |= 0x00800000;
1458
    return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1459

    
1460
}
1461

    
1462
#endif
1463

    
1464
#ifdef FLOAT128
1465

    
1466
/*----------------------------------------------------------------------------
1467
| Returns the result of converting the single-precision floating-point value
1468
| `a' to the double-precision floating-point format.  The conversion is
1469
| performed according to the IEC/IEEE Standard for Binary Floating-Point
1470
| Arithmetic.
1471
*----------------------------------------------------------------------------*/
1472

    
1473
float128 float32_to_float128( float32 a STATUS_PARAM )
1474
{
1475
    flag aSign;
1476
    int16 aExp;
1477
    bits32 aSig;
1478

    
1479
    aSig = extractFloat32Frac( a );
1480
    aExp = extractFloat32Exp( a );
1481
    aSign = extractFloat32Sign( a );
1482
    if ( aExp == 0xFF ) {
1483
        if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1484
        return packFloat128( aSign, 0x7FFF, 0, 0 );
1485
    }
1486
    if ( aExp == 0 ) {
1487
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1488
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1489
        --aExp;
1490
    }
1491
    return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1492

    
1493
}
1494

    
1495
#endif
1496

    
1497
/*----------------------------------------------------------------------------
1498
| Rounds the single-precision floating-point value `a' to an integer, and
1499
| returns the result as a single-precision floating-point value.  The
1500
| operation is performed according to the IEC/IEEE Standard for Binary
1501
| Floating-Point Arithmetic.
1502
*----------------------------------------------------------------------------*/
1503

    
1504
float32 float32_round_to_int( float32 a STATUS_PARAM)
1505
{
1506
    flag aSign;
1507
    int16 aExp;
1508
    bits32 lastBitMask, roundBitsMask;
1509
    int8 roundingMode;
1510
    float32 z;
1511

    
1512
    aExp = extractFloat32Exp( a );
1513
    if ( 0x96 <= aExp ) {
1514
        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1515
            return propagateFloat32NaN( a, a STATUS_VAR );
1516
        }
1517
        return a;
1518
    }
1519
    if ( aExp <= 0x7E ) {
1520
        if ( (bits32) ( a<<1 ) == 0 ) return a;
1521
        STATUS(float_exception_flags) |= float_flag_inexact;
1522
        aSign = extractFloat32Sign( a );
1523
        switch ( STATUS(float_rounding_mode) ) {
1524
         case float_round_nearest_even:
1525
            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1526
                return packFloat32( aSign, 0x7F, 0 );
1527
            }
1528
            break;
1529
         case float_round_down:
1530
            return aSign ? 0xBF800000 : 0;
1531
         case float_round_up:
1532
            return aSign ? 0x80000000 : 0x3F800000;
1533
        }
1534
        return packFloat32( aSign, 0, 0 );
1535
    }
1536
    lastBitMask = 1;
1537
    lastBitMask <<= 0x96 - aExp;
1538
    roundBitsMask = lastBitMask - 1;
1539
    z = a;
1540
    roundingMode = STATUS(float_rounding_mode);
1541
    if ( roundingMode == float_round_nearest_even ) {
1542
        z += lastBitMask>>1;
1543
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1544
    }
1545
    else if ( roundingMode != float_round_to_zero ) {
1546
        if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1547
            z += roundBitsMask;
1548
        }
1549
    }
1550
    z &= ~ roundBitsMask;
1551
    if ( z != a ) STATUS(float_exception_flags) |= float_flag_inexact;
1552
    return z;
1553

    
1554
}
1555

    
1556
/*----------------------------------------------------------------------------
1557
| Returns the result of adding the absolute values of the single-precision
1558
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1559
| before being returned.  `zSign' is ignored if the result is a NaN.
1560
| The addition is performed according to the IEC/IEEE Standard for Binary
1561
| Floating-Point Arithmetic.
1562
*----------------------------------------------------------------------------*/
1563

    
1564
static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1565
{
1566
    int16 aExp, bExp, zExp;
1567
    bits32 aSig, bSig, zSig;
1568
    int16 expDiff;
1569

    
1570
    aSig = extractFloat32Frac( a );
1571
    aExp = extractFloat32Exp( a );
1572
    bSig = extractFloat32Frac( b );
1573
    bExp = extractFloat32Exp( b );
1574
    expDiff = aExp - bExp;
1575
    aSig <<= 6;
1576
    bSig <<= 6;
1577
    if ( 0 < expDiff ) {
1578
        if ( aExp == 0xFF ) {
1579
            if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1580
            return a;
1581
        }
1582
        if ( bExp == 0 ) {
1583
            --expDiff;
1584
        }
1585
        else {
1586
            bSig |= 0x20000000;
1587
        }
1588
        shift32RightJamming( bSig, expDiff, &bSig );
1589
        zExp = aExp;
1590
    }
1591
    else if ( expDiff < 0 ) {
1592
        if ( bExp == 0xFF ) {
1593
            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1594
            return packFloat32( zSign, 0xFF, 0 );
1595
        }
1596
        if ( aExp == 0 ) {
1597
            ++expDiff;
1598
        }
1599
        else {
1600
            aSig |= 0x20000000;
1601
        }
1602
        shift32RightJamming( aSig, - expDiff, &aSig );
1603
        zExp = bExp;
1604
    }
1605
    else {
1606
        if ( aExp == 0xFF ) {
1607
            if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1608
            return a;
1609
        }
1610
        if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1611
        zSig = 0x40000000 + aSig + bSig;
1612
        zExp = aExp;
1613
        goto roundAndPack;
1614
    }
1615
    aSig |= 0x20000000;
1616
    zSig = ( aSig + bSig )<<1;
1617
    --zExp;
1618
    if ( (sbits32) zSig < 0 ) {
1619
        zSig = aSig + bSig;
1620
        ++zExp;
1621
    }
1622
 roundAndPack:
1623
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1624

    
1625
}
1626

    
1627
/*----------------------------------------------------------------------------
1628
| Returns the result of subtracting the absolute values of the single-
1629
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
1630
| difference is negated before being returned.  `zSign' is ignored if the
1631
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
1632
| Standard for Binary Floating-Point Arithmetic.
1633
*----------------------------------------------------------------------------*/
1634

    
1635
static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1636
{
1637
    int16 aExp, bExp, zExp;
1638
    bits32 aSig, bSig, zSig;
1639
    int16 expDiff;
1640

    
1641
    aSig = extractFloat32Frac( a );
1642
    aExp = extractFloat32Exp( a );
1643
    bSig = extractFloat32Frac( b );
1644
    bExp = extractFloat32Exp( b );
1645
    expDiff = aExp - bExp;
1646
    aSig <<= 7;
1647
    bSig <<= 7;
1648
    if ( 0 < expDiff ) goto aExpBigger;
1649
    if ( expDiff < 0 ) goto bExpBigger;
1650
    if ( aExp == 0xFF ) {
1651
        if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1652
        float_raise( float_flag_invalid STATUS_VAR);
1653
        return float32_default_nan;
1654
    }
1655
    if ( aExp == 0 ) {
1656
        aExp = 1;
1657
        bExp = 1;
1658
    }
1659
    if ( bSig < aSig ) goto aBigger;
1660
    if ( aSig < bSig ) goto bBigger;
1661
    return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1662
 bExpBigger:
1663
    if ( bExp == 0xFF ) {
1664
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1665
        return packFloat32( zSign ^ 1, 0xFF, 0 );
1666
    }
1667
    if ( aExp == 0 ) {
1668
        ++expDiff;
1669
    }
1670
    else {
1671
        aSig |= 0x40000000;
1672
    }
1673
    shift32RightJamming( aSig, - expDiff, &aSig );
1674
    bSig |= 0x40000000;
1675
 bBigger:
1676
    zSig = bSig - aSig;
1677
    zExp = bExp;
1678
    zSign ^= 1;
1679
    goto normalizeRoundAndPack;
1680
 aExpBigger:
1681
    if ( aExp == 0xFF ) {
1682
        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1683
        return a;
1684
    }
1685
    if ( bExp == 0 ) {
1686
        --expDiff;
1687
    }
1688
    else {
1689
        bSig |= 0x40000000;
1690
    }
1691
    shift32RightJamming( bSig, expDiff, &bSig );
1692
    aSig |= 0x40000000;
1693
 aBigger:
1694
    zSig = aSig - bSig;
1695
    zExp = aExp;
1696
 normalizeRoundAndPack:
1697
    --zExp;
1698
    return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1699

    
1700
}
1701

    
1702
/*----------------------------------------------------------------------------
1703
| Returns the result of adding the single-precision floating-point values `a'
1704
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
1705
| Binary Floating-Point Arithmetic.
1706
*----------------------------------------------------------------------------*/
1707

    
1708
float32 float32_add( float32 a, float32 b STATUS_PARAM )
1709
{
1710
    flag aSign, bSign;
1711

    
1712
    aSign = extractFloat32Sign( a );
1713
    bSign = extractFloat32Sign( b );
1714
    if ( aSign == bSign ) {
1715
        return addFloat32Sigs( a, b, aSign STATUS_VAR);
1716
    }
1717
    else {
1718
        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1719
    }
1720

    
1721
}
1722

    
1723
/*----------------------------------------------------------------------------
1724
| Returns the result of subtracting the single-precision floating-point values
1725
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1726
| for Binary Floating-Point Arithmetic.
1727
*----------------------------------------------------------------------------*/
1728

    
1729
float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1730
{
1731
    flag aSign, bSign;
1732

    
1733
    aSign = extractFloat32Sign( a );
1734
    bSign = extractFloat32Sign( b );
1735
    if ( aSign == bSign ) {
1736
        return subFloat32Sigs( a, b, aSign STATUS_VAR );
1737
    }
1738
    else {
1739
        return addFloat32Sigs( a, b, aSign STATUS_VAR );
1740
    }
1741

    
1742
}
1743

    
1744
/*----------------------------------------------------------------------------
1745
| Returns the result of multiplying the single-precision floating-point values
1746
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1747
| for Binary Floating-Point Arithmetic.
1748
*----------------------------------------------------------------------------*/
1749

    
1750
float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1751
{
1752
    flag aSign, bSign, zSign;
1753
    int16 aExp, bExp, zExp;
1754
    bits32 aSig, bSig;
1755
    bits64 zSig64;
1756
    bits32 zSig;
1757

    
1758
    aSig = extractFloat32Frac( a );
1759
    aExp = extractFloat32Exp( a );
1760
    aSign = extractFloat32Sign( a );
1761
    bSig = extractFloat32Frac( b );
1762
    bExp = extractFloat32Exp( b );
1763
    bSign = extractFloat32Sign( b );
1764
    zSign = aSign ^ bSign;
1765
    if ( aExp == 0xFF ) {
1766
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1767
            return propagateFloat32NaN( a, b STATUS_VAR );
1768
        }
1769
        if ( ( bExp | bSig ) == 0 ) {
1770
            float_raise( float_flag_invalid STATUS_VAR);
1771
            return float32_default_nan;
1772
        }
1773
        return packFloat32( zSign, 0xFF, 0 );
1774
    }
1775
    if ( bExp == 0xFF ) {
1776
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1777
        if ( ( aExp | aSig ) == 0 ) {
1778
            float_raise( float_flag_invalid STATUS_VAR);
1779
            return float32_default_nan;
1780
        }
1781
        return packFloat32( zSign, 0xFF, 0 );
1782
    }
1783
    if ( aExp == 0 ) {
1784
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1785
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1786
    }
1787
    if ( bExp == 0 ) {
1788
        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1789
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1790
    }
1791
    zExp = aExp + bExp - 0x7F;
1792
    aSig = ( aSig | 0x00800000 )<<7;
1793
    bSig = ( bSig | 0x00800000 )<<8;
1794
    shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1795
    zSig = zSig64;
1796
    if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1797
        zSig <<= 1;
1798
        --zExp;
1799
    }
1800
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1801

    
1802
}
1803

    
1804
/*----------------------------------------------------------------------------
1805
| Returns the result of dividing the single-precision floating-point value `a'
1806
| by the corresponding value `b'.  The operation is performed according to the
1807
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1808
*----------------------------------------------------------------------------*/
1809

    
1810
float32 float32_div( float32 a, float32 b STATUS_PARAM )
1811
{
1812
    flag aSign, bSign, zSign;
1813
    int16 aExp, bExp, zExp;
1814
    bits32 aSig, bSig, zSig;
1815

    
1816
    aSig = extractFloat32Frac( a );
1817
    aExp = extractFloat32Exp( a );
1818
    aSign = extractFloat32Sign( a );
1819
    bSig = extractFloat32Frac( b );
1820
    bExp = extractFloat32Exp( b );
1821
    bSign = extractFloat32Sign( b );
1822
    zSign = aSign ^ bSign;
1823
    if ( aExp == 0xFF ) {
1824
        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1825
        if ( bExp == 0xFF ) {
1826
            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1827
            float_raise( float_flag_invalid STATUS_VAR);
1828
            return float32_default_nan;
1829
        }
1830
        return packFloat32( zSign, 0xFF, 0 );
1831
    }
1832
    if ( bExp == 0xFF ) {
1833
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1834
        return packFloat32( zSign, 0, 0 );
1835
    }
1836
    if ( bExp == 0 ) {
1837
        if ( bSig == 0 ) {
1838
            if ( ( aExp | aSig ) == 0 ) {
1839
                float_raise( float_flag_invalid STATUS_VAR);
1840
                return float32_default_nan;
1841
            }
1842
            float_raise( float_flag_divbyzero STATUS_VAR);
1843
            return packFloat32( zSign, 0xFF, 0 );
1844
        }
1845
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1846
    }
1847
    if ( aExp == 0 ) {
1848
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1849
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1850
    }
1851
    zExp = aExp - bExp + 0x7D;
1852
    aSig = ( aSig | 0x00800000 )<<7;
1853
    bSig = ( bSig | 0x00800000 )<<8;
1854
    if ( bSig <= ( aSig + aSig ) ) {
1855
        aSig >>= 1;
1856
        ++zExp;
1857
    }
1858
    zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1859
    if ( ( zSig & 0x3F ) == 0 ) {
1860
        zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1861
    }
1862
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1863

    
1864
}
1865

    
1866
/*----------------------------------------------------------------------------
1867
| Returns the remainder of the single-precision floating-point value `a'
1868
| with respect to the corresponding value `b'.  The operation is performed
1869
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1870
*----------------------------------------------------------------------------*/
1871

    
1872
float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1873
{
1874
    flag aSign, bSign, zSign;
1875
    int16 aExp, bExp, expDiff;
1876
    bits32 aSig, bSig;
1877
    bits32 q;
1878
    bits64 aSig64, bSig64, q64;
1879
    bits32 alternateASig;
1880
    sbits32 sigMean;
1881

    
1882
    aSig = extractFloat32Frac( a );
1883
    aExp = extractFloat32Exp( a );
1884
    aSign = extractFloat32Sign( a );
1885
    bSig = extractFloat32Frac( b );
1886
    bExp = extractFloat32Exp( b );
1887
    bSign = extractFloat32Sign( b );
1888
    if ( aExp == 0xFF ) {
1889
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1890
            return propagateFloat32NaN( a, b STATUS_VAR );
1891
        }
1892
        float_raise( float_flag_invalid STATUS_VAR);
1893
        return float32_default_nan;
1894
    }
1895
    if ( bExp == 0xFF ) {
1896
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1897
        return a;
1898
    }
1899
    if ( bExp == 0 ) {
1900
        if ( bSig == 0 ) {
1901
            float_raise( float_flag_invalid STATUS_VAR);
1902
            return float32_default_nan;
1903
        }
1904
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1905
    }
1906
    if ( aExp == 0 ) {
1907
        if ( aSig == 0 ) return a;
1908
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1909
    }
1910
    expDiff = aExp - bExp;
1911
    aSig |= 0x00800000;
1912
    bSig |= 0x00800000;
1913
    if ( expDiff < 32 ) {
1914
        aSig <<= 8;
1915
        bSig <<= 8;
1916
        if ( expDiff < 0 ) {
1917
            if ( expDiff < -1 ) return a;
1918
            aSig >>= 1;
1919
        }
1920
        q = ( bSig <= aSig );
1921
        if ( q ) aSig -= bSig;
1922
        if ( 0 < expDiff ) {
1923
            q = ( ( (bits64) aSig )<<32 ) / bSig;
1924
            q >>= 32 - expDiff;
1925
            bSig >>= 2;
1926
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1927
        }
1928
        else {
1929
            aSig >>= 2;
1930
            bSig >>= 2;
1931
        }
1932
    }
1933
    else {
1934
        if ( bSig <= aSig ) aSig -= bSig;
1935
        aSig64 = ( (bits64) aSig )<<40;
1936
        bSig64 = ( (bits64) bSig )<<40;
1937
        expDiff -= 64;
1938
        while ( 0 < expDiff ) {
1939
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1940
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1941
            aSig64 = - ( ( bSig * q64 )<<38 );
1942
            expDiff -= 62;
1943
        }
1944
        expDiff += 64;
1945
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1946
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1947
        q = q64>>( 64 - expDiff );
1948
        bSig <<= 6;
1949
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1950
    }
1951
    do {
1952
        alternateASig = aSig;
1953
        ++q;
1954
        aSig -= bSig;
1955
    } while ( 0 <= (sbits32) aSig );
1956
    sigMean = aSig + alternateASig;
1957
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1958
        aSig = alternateASig;
1959
    }
1960
    zSign = ( (sbits32) aSig < 0 );
1961
    if ( zSign ) aSig = - aSig;
1962
    return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
1963

    
1964
}
1965

    
1966
/*----------------------------------------------------------------------------
1967
| Returns the square root of the single-precision floating-point value `a'.
1968
| The operation is performed according to the IEC/IEEE Standard for Binary
1969
| Floating-Point Arithmetic.
1970
*----------------------------------------------------------------------------*/
1971

    
1972
float32 float32_sqrt( float32 a STATUS_PARAM )
1973
{
1974
    flag aSign;
1975
    int16 aExp, zExp;
1976
    bits32 aSig, zSig;
1977
    bits64 rem, term;
1978

    
1979
    aSig = extractFloat32Frac( a );
1980
    aExp = extractFloat32Exp( a );
1981
    aSign = extractFloat32Sign( a );
1982
    if ( aExp == 0xFF ) {
1983
        if ( aSig ) return propagateFloat32NaN( a, 0 STATUS_VAR );
1984
        if ( ! aSign ) return a;
1985
        float_raise( float_flag_invalid STATUS_VAR);
1986
        return float32_default_nan;
1987
    }
1988
    if ( aSign ) {
1989
        if ( ( aExp | aSig ) == 0 ) return a;
1990
        float_raise( float_flag_invalid STATUS_VAR);
1991
        return float32_default_nan;
1992
    }
1993
    if ( aExp == 0 ) {
1994
        if ( aSig == 0 ) return 0;
1995
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1996
    }
1997
    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1998
    aSig = ( aSig | 0x00800000 )<<8;
1999
    zSig = estimateSqrt32( aExp, aSig ) + 2;
2000
    if ( ( zSig & 0x7F ) <= 5 ) {
2001
        if ( zSig < 2 ) {
2002
            zSig = 0x7FFFFFFF;
2003
            goto roundAndPack;
2004
        }
2005
        aSig >>= aExp & 1;
2006
        term = ( (bits64) zSig ) * zSig;
2007
        rem = ( ( (bits64) aSig )<<32 ) - term;
2008
        while ( (sbits64) rem < 0 ) {
2009
            --zSig;
2010
            rem += ( ( (bits64) zSig )<<1 ) | 1;
2011
        }
2012
        zSig |= ( rem != 0 );
2013
    }
2014
    shift32RightJamming( zSig, 1, &zSig );
2015
 roundAndPack:
2016
    return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2017

    
2018
}
2019

    
2020
/*----------------------------------------------------------------------------
2021
| Returns 1 if the single-precision floating-point value `a' is equal to
2022
| the corresponding value `b', and 0 otherwise.  The comparison is performed
2023
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2024
*----------------------------------------------------------------------------*/
2025

    
2026
flag float32_eq( float32 a, float32 b STATUS_PARAM )
2027
{
2028

    
2029
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2030
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2031
       ) {
2032
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2033
            float_raise( float_flag_invalid STATUS_VAR);
2034
        }
2035
        return 0;
2036
    }
2037
    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2038

    
2039
}
2040

    
2041
/*----------------------------------------------------------------------------
2042
| Returns 1 if the single-precision floating-point value `a' is less than
2043
| or equal to the corresponding value `b', and 0 otherwise.  The comparison
2044
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2045
| Arithmetic.
2046
*----------------------------------------------------------------------------*/
2047

    
2048
flag float32_le( float32 a, float32 b STATUS_PARAM )
2049
{
2050
    flag aSign, bSign;
2051

    
2052
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2053
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2054
       ) {
2055
        float_raise( float_flag_invalid STATUS_VAR);
2056
        return 0;
2057
    }
2058
    aSign = extractFloat32Sign( a );
2059
    bSign = extractFloat32Sign( b );
2060
    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2061
    return ( a == b ) || ( aSign ^ ( a < b ) );
2062

    
2063
}
2064

    
2065
/*----------------------------------------------------------------------------
2066
| Returns 1 if the single-precision floating-point value `a' is less than
2067
| the corresponding value `b', and 0 otherwise.  The comparison is performed
2068
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2069
*----------------------------------------------------------------------------*/
2070

    
2071
flag float32_lt( float32 a, float32 b STATUS_PARAM )
2072
{
2073
    flag aSign, bSign;
2074

    
2075
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2076
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2077
       ) {
2078
        float_raise( float_flag_invalid STATUS_VAR);
2079
        return 0;
2080
    }
2081
    aSign = extractFloat32Sign( a );
2082
    bSign = extractFloat32Sign( b );
2083
    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2084
    return ( a != b ) && ( aSign ^ ( a < b ) );
2085

    
2086
}
2087

    
2088
/*----------------------------------------------------------------------------
2089
| Returns 1 if the single-precision floating-point value `a' is equal to
2090
| the corresponding value `b', and 0 otherwise.  The invalid exception is
2091
| raised if either operand is a NaN.  Otherwise, the comparison is performed
2092
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2093
*----------------------------------------------------------------------------*/
2094

    
2095
flag float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2096
{
2097

    
2098
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2099
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2100
       ) {
2101
        float_raise( float_flag_invalid STATUS_VAR);
2102
        return 0;
2103
    }
2104
    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2105

    
2106
}
2107

    
2108
/*----------------------------------------------------------------------------
2109
| Returns 1 if the single-precision floating-point value `a' is less than or
2110
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2111
| cause an exception.  Otherwise, the comparison is performed according to the
2112
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2113
*----------------------------------------------------------------------------*/
2114

    
2115
flag float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2116
{
2117
    flag aSign, bSign;
2118

    
2119
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2120
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2121
       ) {
2122
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2123
            float_raise( float_flag_invalid STATUS_VAR);
2124
        }
2125
        return 0;
2126
    }
2127
    aSign = extractFloat32Sign( a );
2128
    bSign = extractFloat32Sign( b );
2129
    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2130
    return ( a == b ) || ( aSign ^ ( a < b ) );
2131

    
2132
}
2133

    
2134
/*----------------------------------------------------------------------------
2135
| Returns 1 if the single-precision floating-point value `a' is less than
2136
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2137
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2138
| Standard for Binary Floating-Point Arithmetic.
2139
*----------------------------------------------------------------------------*/
2140

    
2141
flag float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2142
{
2143
    flag aSign, bSign;
2144

    
2145
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2146
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2147
       ) {
2148
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2149
            float_raise( float_flag_invalid STATUS_VAR);
2150
        }
2151
        return 0;
2152
    }
2153
    aSign = extractFloat32Sign( a );
2154
    bSign = extractFloat32Sign( b );
2155
    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2156
    return ( a != b ) && ( aSign ^ ( a < b ) );
2157

    
2158
}
2159

    
2160
/*----------------------------------------------------------------------------
2161
| Returns the result of converting the double-precision floating-point value
2162
| `a' to the 32-bit two's complement integer format.  The conversion is
2163
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2164
| Arithmetic---which means in particular that the conversion is rounded
2165
| according to the current rounding mode.  If `a' is a NaN, the largest
2166
| positive integer is returned.  Otherwise, if the conversion overflows, the
2167
| largest integer with the same sign as `a' is returned.
2168
*----------------------------------------------------------------------------*/
2169

    
2170
int32 float64_to_int32( float64 a STATUS_PARAM )
2171
{
2172
    flag aSign;
2173
    int16 aExp, shiftCount;
2174
    bits64 aSig;
2175

    
2176
    aSig = extractFloat64Frac( a );
2177
    aExp = extractFloat64Exp( a );
2178
    aSign = extractFloat64Sign( a );
2179
    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2180
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2181
    shiftCount = 0x42C - aExp;
2182
    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2183
    return roundAndPackInt32( aSign, aSig STATUS_VAR );
2184

    
2185
}
2186

    
2187
/*----------------------------------------------------------------------------
2188
| Returns the result of converting the double-precision floating-point value
2189
| `a' to the 32-bit two's complement integer format.  The conversion is
2190
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2191
| Arithmetic, except that the conversion is always rounded toward zero.
2192
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2193
| the conversion overflows, the largest integer with the same sign as `a' is
2194
| returned.
2195
*----------------------------------------------------------------------------*/
2196

    
2197
int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2198
{
2199
    flag aSign;
2200
    int16 aExp, shiftCount;
2201
    bits64 aSig, savedASig;
2202
    int32 z;
2203

    
2204
    aSig = extractFloat64Frac( a );
2205
    aExp = extractFloat64Exp( a );
2206
    aSign = extractFloat64Sign( a );
2207
    if ( 0x41E < aExp ) {
2208
        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2209
        goto invalid;
2210
    }
2211
    else if ( aExp < 0x3FF ) {
2212
        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2213
        return 0;
2214
    }
2215
    aSig |= LIT64( 0x0010000000000000 );
2216
    shiftCount = 0x433 - aExp;
2217
    savedASig = aSig;
2218
    aSig >>= shiftCount;
2219
    z = aSig;
2220
    if ( aSign ) z = - z;
2221
    if ( ( z < 0 ) ^ aSign ) {
2222
 invalid:
2223
        float_raise( float_flag_invalid STATUS_VAR);
2224
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2225
    }
2226
    if ( ( aSig<<shiftCount ) != savedASig ) {
2227
        STATUS(float_exception_flags) |= float_flag_inexact;
2228
    }
2229
    return z;
2230

    
2231
}
2232

    
2233
/*----------------------------------------------------------------------------
2234
| Returns the result of converting the double-precision floating-point value
2235
| `a' to the 64-bit two's complement integer format.  The conversion is
2236
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2237
| Arithmetic---which means in particular that the conversion is rounded
2238
| according to the current rounding mode.  If `a' is a NaN, the largest
2239
| positive integer is returned.  Otherwise, if the conversion overflows, the
2240
| largest integer with the same sign as `a' is returned.
2241
*----------------------------------------------------------------------------*/
2242

    
2243
int64 float64_to_int64( float64 a STATUS_PARAM )
2244
{
2245
    flag aSign;
2246
    int16 aExp, shiftCount;
2247
    bits64 aSig, aSigExtra;
2248

    
2249
    aSig = extractFloat64Frac( a );
2250
    aExp = extractFloat64Exp( a );
2251
    aSign = extractFloat64Sign( a );
2252
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2253
    shiftCount = 0x433 - aExp;
2254
    if ( shiftCount <= 0 ) {
2255
        if ( 0x43E < aExp ) {
2256
            float_raise( float_flag_invalid STATUS_VAR);
2257
            if (    ! aSign
2258
                 || (    ( aExp == 0x7FF )
2259
                      && ( aSig != LIT64( 0x0010000000000000 ) ) )
2260
               ) {
2261
                return LIT64( 0x7FFFFFFFFFFFFFFF );
2262
            }
2263
            return (sbits64) LIT64( 0x8000000000000000 );
2264
        }
2265
        aSigExtra = 0;
2266
        aSig <<= - shiftCount;
2267
    }
2268
    else {
2269
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2270
    }
2271
    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2272

    
2273
}
2274

    
2275
/*----------------------------------------------------------------------------
2276
| Returns the result of converting the double-precision floating-point value
2277
| `a' to the 64-bit two's complement integer format.  The conversion is
2278
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2279
| Arithmetic, except that the conversion is always rounded toward zero.
2280
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2281
| the conversion overflows, the largest integer with the same sign as `a' is
2282
| returned.
2283
*----------------------------------------------------------------------------*/
2284

    
2285
int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2286
{
2287
    flag aSign;
2288
    int16 aExp, shiftCount;
2289
    bits64 aSig;
2290
    int64 z;
2291

    
2292
    aSig = extractFloat64Frac( a );
2293
    aExp = extractFloat64Exp( a );
2294
    aSign = extractFloat64Sign( a );
2295
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2296
    shiftCount = aExp - 0x433;
2297
    if ( 0 <= shiftCount ) {
2298
        if ( 0x43E <= aExp ) {
2299
            if ( a != LIT64( 0xC3E0000000000000 ) ) {
2300
                float_raise( float_flag_invalid STATUS_VAR);
2301
                if (    ! aSign
2302
                     || (    ( aExp == 0x7FF )
2303
                          && ( aSig != LIT64( 0x0010000000000000 ) ) )
2304
                   ) {
2305
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
2306
                }
2307
            }
2308
            return (sbits64) LIT64( 0x8000000000000000 );
2309
        }
2310
        z = aSig<<shiftCount;
2311
    }
2312
    else {
2313
        if ( aExp < 0x3FE ) {
2314
            if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2315
            return 0;
2316
        }
2317
        z = aSig>>( - shiftCount );
2318
        if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2319
            STATUS(float_exception_flags) |= float_flag_inexact;
2320
        }
2321
    }
2322
    if ( aSign ) z = - z;
2323
    return z;
2324

    
2325
}
2326

    
2327
/*----------------------------------------------------------------------------
2328
| Returns the result of converting the double-precision floating-point value
2329
| `a' to the single-precision floating-point format.  The conversion is
2330
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2331
| Arithmetic.
2332
*----------------------------------------------------------------------------*/
2333

    
2334
float32 float64_to_float32( float64 a STATUS_PARAM )
2335
{
2336
    flag aSign;
2337
    int16 aExp;
2338
    bits64 aSig;
2339
    bits32 zSig;
2340

    
2341
    aSig = extractFloat64Frac( a );
2342
    aExp = extractFloat64Exp( a );
2343
    aSign = extractFloat64Sign( a );
2344
    if ( aExp == 0x7FF ) {
2345
        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2346
        return packFloat32( aSign, 0xFF, 0 );
2347
    }
2348
    shift64RightJamming( aSig, 22, &aSig );
2349
    zSig = aSig;
2350
    if ( aExp || zSig ) {
2351
        zSig |= 0x40000000;
2352
        aExp -= 0x381;
2353
    }
2354
    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2355

    
2356
}
2357

    
2358
#ifdef FLOATX80
2359

    
2360
/*----------------------------------------------------------------------------
2361
| Returns the result of converting the double-precision floating-point value
2362
| `a' to the extended double-precision floating-point format.  The conversion
2363
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2364
| Arithmetic.
2365
*----------------------------------------------------------------------------*/
2366

    
2367
floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2368
{
2369
    flag aSign;
2370
    int16 aExp;
2371
    bits64 aSig;
2372

    
2373
    aSig = extractFloat64Frac( a );
2374
    aExp = extractFloat64Exp( a );
2375
    aSign = extractFloat64Sign( a );
2376
    if ( aExp == 0x7FF ) {
2377
        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2378
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2379
    }
2380
    if ( aExp == 0 ) {
2381
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2382
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2383
    }
2384
    return
2385
        packFloatx80(
2386
            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2387

    
2388
}
2389

    
2390
#endif
2391

    
2392
#ifdef FLOAT128
2393

    
2394
/*----------------------------------------------------------------------------
2395
| Returns the result of converting the double-precision floating-point value
2396
| `a' to the quadruple-precision floating-point format.  The conversion is
2397
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2398
| Arithmetic.
2399
*----------------------------------------------------------------------------*/
2400

    
2401
float128 float64_to_float128( float64 a STATUS_PARAM )
2402
{
2403
    flag aSign;
2404
    int16 aExp;
2405
    bits64 aSig, zSig0, zSig1;
2406

    
2407
    aSig = extractFloat64Frac( a );
2408
    aExp = extractFloat64Exp( a );
2409
    aSign = extractFloat64Sign( a );
2410
    if ( aExp == 0x7FF ) {
2411
        if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2412
        return packFloat128( aSign, 0x7FFF, 0, 0 );
2413
    }
2414
    if ( aExp == 0 ) {
2415
        if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2416
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2417
        --aExp;
2418
    }
2419
    shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2420
    return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2421

    
2422
}
2423

    
2424
#endif
2425

    
2426
/*----------------------------------------------------------------------------
2427
| Rounds the double-precision floating-point value `a' to an integer, and
2428
| returns the result as a double-precision floating-point value.  The
2429
| operation is performed according to the IEC/IEEE Standard for Binary
2430
| Floating-Point Arithmetic.
2431
*----------------------------------------------------------------------------*/
2432

    
2433
float64 float64_round_to_int( float64 a STATUS_PARAM )
2434
{
2435
    flag aSign;
2436
    int16 aExp;
2437
    bits64 lastBitMask, roundBitsMask;
2438
    int8 roundingMode;
2439
    float64 z;
2440

    
2441
    aExp = extractFloat64Exp( a );
2442
    if ( 0x433 <= aExp ) {
2443
        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2444
            return propagateFloat64NaN( a, a STATUS_VAR );
2445
        }
2446
        return a;
2447
    }
2448
    if ( aExp < 0x3FF ) {
2449
        if ( (bits64) ( a<<1 ) == 0 ) return a;
2450
        STATUS(float_exception_flags) |= float_flag_inexact;
2451
        aSign = extractFloat64Sign( a );
2452
        switch ( STATUS(float_rounding_mode) ) {
2453
         case float_round_nearest_even:
2454
            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2455
                return packFloat64( aSign, 0x3FF, 0 );
2456
            }
2457
            break;
2458
         case float_round_down:
2459
            return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2460
         case float_round_up:
2461
            return
2462
            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2463
        }
2464
        return packFloat64( aSign, 0, 0 );
2465
    }
2466
    lastBitMask = 1;
2467
    lastBitMask <<= 0x433 - aExp;
2468
    roundBitsMask = lastBitMask - 1;
2469
    z = a;
2470
    roundingMode = STATUS(float_rounding_mode);
2471
    if ( roundingMode == float_round_nearest_even ) {
2472
        z += lastBitMask>>1;
2473
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2474
    }
2475
    else if ( roundingMode != float_round_to_zero ) {
2476
        if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2477
            z += roundBitsMask;
2478
        }
2479
    }
2480
    z &= ~ roundBitsMask;
2481
    if ( z != a ) STATUS(float_exception_flags) |= float_flag_inexact;
2482
    return z;
2483

    
2484
}
2485

    
2486
/*----------------------------------------------------------------------------
2487
| Returns the result of adding the absolute values of the double-precision
2488
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2489
| before being returned.  `zSign' is ignored if the result is a NaN.
2490
| The addition is performed according to the IEC/IEEE Standard for Binary
2491
| Floating-Point Arithmetic.
2492
*----------------------------------------------------------------------------*/
2493

    
2494
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2495
{
2496
    int16 aExp, bExp, zExp;
2497
    bits64 aSig, bSig, zSig;
2498
    int16 expDiff;
2499

    
2500
    aSig = extractFloat64Frac( a );
2501
    aExp = extractFloat64Exp( a );
2502
    bSig = extractFloat64Frac( b );
2503
    bExp = extractFloat64Exp( b );
2504
    expDiff = aExp - bExp;
2505
    aSig <<= 9;
2506
    bSig <<= 9;
2507
    if ( 0 < expDiff ) {
2508
        if ( aExp == 0x7FF ) {
2509
            if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2510
            return a;
2511
        }
2512
        if ( bExp == 0 ) {
2513
            --expDiff;
2514
        }
2515
        else {
2516
            bSig |= LIT64( 0x2000000000000000 );
2517
        }
2518
        shift64RightJamming( bSig, expDiff, &bSig );
2519
        zExp = aExp;
2520
    }
2521
    else if ( expDiff < 0 ) {
2522
        if ( bExp == 0x7FF ) {
2523
            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2524
            return packFloat64( zSign, 0x7FF, 0 );
2525
        }
2526
        if ( aExp == 0 ) {
2527
            ++expDiff;
2528
        }
2529
        else {
2530
            aSig |= LIT64( 0x2000000000000000 );
2531
        }
2532
        shift64RightJamming( aSig, - expDiff, &aSig );
2533
        zExp = bExp;
2534
    }
2535
    else {
2536
        if ( aExp == 0x7FF ) {
2537
            if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2538
            return a;
2539
        }
2540
        if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2541
        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2542
        zExp = aExp;
2543
        goto roundAndPack;
2544
    }
2545
    aSig |= LIT64( 0x2000000000000000 );
2546
    zSig = ( aSig + bSig )<<1;
2547
    --zExp;
2548
    if ( (sbits64) zSig < 0 ) {
2549
        zSig = aSig + bSig;
2550
        ++zExp;
2551
    }
2552
 roundAndPack:
2553
    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2554

    
2555
}
2556

    
2557
/*----------------------------------------------------------------------------
2558
| Returns the result of subtracting the absolute values of the double-
2559
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
2560
| difference is negated before being returned.  `zSign' is ignored if the
2561
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
2562
| Standard for Binary Floating-Point Arithmetic.
2563
*----------------------------------------------------------------------------*/
2564

    
2565
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2566
{
2567
    int16 aExp, bExp, zExp;
2568
    bits64 aSig, bSig, zSig;
2569
    int16 expDiff;
2570

    
2571
    aSig = extractFloat64Frac( a );
2572
    aExp = extractFloat64Exp( a );
2573
    bSig = extractFloat64Frac( b );
2574
    bExp = extractFloat64Exp( b );
2575
    expDiff = aExp - bExp;
2576
    aSig <<= 10;
2577
    bSig <<= 10;
2578
    if ( 0 < expDiff ) goto aExpBigger;
2579
    if ( expDiff < 0 ) goto bExpBigger;
2580
    if ( aExp == 0x7FF ) {
2581
        if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2582
        float_raise( float_flag_invalid STATUS_VAR);
2583
        return float64_default_nan;
2584
    }
2585
    if ( aExp == 0 ) {
2586
        aExp = 1;
2587
        bExp = 1;
2588
    }
2589
    if ( bSig < aSig ) goto aBigger;
2590
    if ( aSig < bSig ) goto bBigger;
2591
    return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2592
 bExpBigger:
2593
    if ( bExp == 0x7FF ) {
2594
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2595
        return packFloat64( zSign ^ 1, 0x7FF, 0 );
2596
    }
2597
    if ( aExp == 0 ) {
2598
        ++expDiff;
2599
    }
2600
    else {
2601
        aSig |= LIT64( 0x4000000000000000 );
2602
    }
2603
    shift64RightJamming( aSig, - expDiff, &aSig );
2604
    bSig |= LIT64( 0x4000000000000000 );
2605
 bBigger:
2606
    zSig = bSig - aSig;
2607
    zExp = bExp;
2608
    zSign ^= 1;
2609
    goto normalizeRoundAndPack;
2610
 aExpBigger:
2611
    if ( aExp == 0x7FF ) {
2612
        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2613
        return a;
2614
    }
2615
    if ( bExp == 0 ) {
2616
        --expDiff;
2617
    }
2618
    else {
2619
        bSig |= LIT64( 0x4000000000000000 );
2620
    }
2621
    shift64RightJamming( bSig, expDiff, &bSig );
2622
    aSig |= LIT64( 0x4000000000000000 );
2623
 aBigger:
2624
    zSig = aSig - bSig;
2625
    zExp = aExp;
2626
 normalizeRoundAndPack:
2627
    --zExp;
2628
    return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2629

    
2630
}
2631

    
2632
/*----------------------------------------------------------------------------
2633
| Returns the result of adding the double-precision floating-point values `a'
2634
| and `b'.  The operation is performed according to the IEC/IEEE Standard for
2635
| Binary Floating-Point Arithmetic.
2636
*----------------------------------------------------------------------------*/
2637

    
2638
float64 float64_add( float64 a, float64 b STATUS_PARAM )
2639
{
2640
    flag aSign, bSign;
2641

    
2642
    aSign = extractFloat64Sign( a );
2643
    bSign = extractFloat64Sign( b );
2644
    if ( aSign == bSign ) {
2645
        return addFloat64Sigs( a, b, aSign STATUS_VAR );
2646
    }
2647
    else {
2648
        return subFloat64Sigs( a, b, aSign STATUS_VAR );
2649
    }
2650

    
2651
}
2652

    
2653
/*----------------------------------------------------------------------------
2654
| Returns the result of subtracting the double-precision floating-point values
2655
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2656
| for Binary Floating-Point Arithmetic.
2657
*----------------------------------------------------------------------------*/
2658

    
2659
float64 float64_sub( float64 a, float64 b STATUS_PARAM )
2660
{
2661
    flag aSign, bSign;
2662

    
2663
    aSign = extractFloat64Sign( a );
2664
    bSign = extractFloat64Sign( b );
2665
    if ( aSign == bSign ) {
2666
        return subFloat64Sigs( a, b, aSign STATUS_VAR );
2667
    }
2668
    else {
2669
        return addFloat64Sigs( a, b, aSign STATUS_VAR );
2670
    }
2671

    
2672
}
2673

    
2674
/*----------------------------------------------------------------------------
2675
| Returns the result of multiplying the double-precision floating-point values
2676
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2677
| for Binary Floating-Point Arithmetic.
2678
*----------------------------------------------------------------------------*/
2679

    
2680
float64 float64_mul( float64 a, float64 b STATUS_PARAM )
2681
{
2682
    flag aSign, bSign, zSign;
2683
    int16 aExp, bExp, zExp;
2684
    bits64 aSig, bSig, zSig0, zSig1;
2685

    
2686
    aSig = extractFloat64Frac( a );
2687
    aExp = extractFloat64Exp( a );
2688
    aSign = extractFloat64Sign( a );
2689
    bSig = extractFloat64Frac( b );
2690
    bExp = extractFloat64Exp( b );
2691
    bSign = extractFloat64Sign( b );
2692
    zSign = aSign ^ bSign;
2693
    if ( aExp == 0x7FF ) {
2694
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2695
            return propagateFloat64NaN( a, b STATUS_VAR );
2696
        }
2697
        if ( ( bExp | bSig ) == 0 ) {
2698
            float_raise( float_flag_invalid STATUS_VAR);
2699
            return float64_default_nan;
2700
        }
2701
        return packFloat64( zSign, 0x7FF, 0 );
2702
    }
2703
    if ( bExp == 0x7FF ) {
2704
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2705
        if ( ( aExp | aSig ) == 0 ) {
2706
            float_raise( float_flag_invalid STATUS_VAR);
2707
            return float64_default_nan;
2708
        }
2709
        return packFloat64( zSign, 0x7FF, 0 );
2710
    }
2711
    if ( aExp == 0 ) {
2712
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2713
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2714
    }
2715
    if ( bExp == 0 ) {
2716
        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2717
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2718
    }
2719
    zExp = aExp + bExp - 0x3FF;
2720
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2721
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2722
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
2723
    zSig0 |= ( zSig1 != 0 );
2724
    if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2725
        zSig0 <<= 1;
2726
        --zExp;
2727
    }
2728
    return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
2729

    
2730
}
2731

    
2732
/*----------------------------------------------------------------------------
2733
| Returns the result of dividing the double-precision floating-point value `a'
2734
| by the corresponding value `b'.  The operation is performed according to
2735
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2736
*----------------------------------------------------------------------------*/
2737

    
2738
float64 float64_div( float64 a, float64 b STATUS_PARAM )
2739
{
2740
    flag aSign, bSign, zSign;
2741
    int16 aExp, bExp, zExp;
2742
    bits64 aSig, bSig, zSig;
2743
    bits64 rem0, rem1;
2744
    bits64 term0, term1;
2745

    
2746
    aSig = extractFloat64Frac( a );
2747
    aExp = extractFloat64Exp( a );
2748
    aSign = extractFloat64Sign( a );
2749
    bSig = extractFloat64Frac( b );
2750
    bExp = extractFloat64Exp( b );
2751
    bSign = extractFloat64Sign( b );
2752
    zSign = aSign ^ bSign;
2753
    if ( aExp == 0x7FF ) {
2754
        if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2755
        if ( bExp == 0x7FF ) {
2756
            if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2757
            float_raise( float_flag_invalid STATUS_VAR);
2758
            return float64_default_nan;
2759
        }
2760
        return packFloat64( zSign, 0x7FF, 0 );
2761
    }
2762
    if ( bExp == 0x7FF ) {
2763
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2764
        return packFloat64( zSign, 0, 0 );
2765
    }
2766
    if ( bExp == 0 ) {
2767
        if ( bSig == 0 ) {
2768
            if ( ( aExp | aSig ) == 0 ) {
2769
                float_raise( float_flag_invalid STATUS_VAR);
2770
                return float64_default_nan;
2771
            }
2772
            float_raise( float_flag_divbyzero STATUS_VAR);
2773
            return packFloat64( zSign, 0x7FF, 0 );
2774
        }
2775
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2776
    }
2777
    if ( aExp == 0 ) {
2778
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2779
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2780
    }
2781
    zExp = aExp - bExp + 0x3FD;
2782
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2783
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2784
    if ( bSig <= ( aSig + aSig ) ) {
2785
        aSig >>= 1;
2786
        ++zExp;
2787
    }
2788
    zSig = estimateDiv128To64( aSig, 0, bSig );
2789
    if ( ( zSig & 0x1FF ) <= 2 ) {
2790
        mul64To128( bSig, zSig, &term0, &term1 );
2791
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2792
        while ( (sbits64) rem0 < 0 ) {
2793
            --zSig;
2794
            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2795
        }
2796
        zSig |= ( rem1 != 0 );
2797
    }
2798
    return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2799

    
2800
}
2801

    
2802
/*----------------------------------------------------------------------------
2803
| Returns the remainder of the double-precision floating-point value `a'
2804
| with respect to the corresponding value `b'.  The operation is performed
2805
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2806
*----------------------------------------------------------------------------*/
2807

    
2808
float64 float64_rem( float64 a, float64 b STATUS_PARAM )
2809
{
2810
    flag aSign, bSign, zSign;
2811
    int16 aExp, bExp, expDiff;
2812
    bits64 aSig, bSig;
2813
    bits64 q, alternateASig;
2814
    sbits64 sigMean;
2815

    
2816
    aSig = extractFloat64Frac( a );
2817
    aExp = extractFloat64Exp( a );
2818
    aSign = extractFloat64Sign( a );
2819
    bSig = extractFloat64Frac( b );
2820
    bExp = extractFloat64Exp( b );
2821
    bSign = extractFloat64Sign( b );
2822
    if ( aExp == 0x7FF ) {
2823
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2824
            return propagateFloat64NaN( a, b STATUS_VAR );
2825
        }
2826
        float_raise( float_flag_invalid STATUS_VAR);
2827
        return float64_default_nan;
2828
    }
2829
    if ( bExp == 0x7FF ) {
2830
        if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2831
        return a;
2832
    }
2833
    if ( bExp == 0 ) {
2834
        if ( bSig == 0 ) {
2835
            float_raise( float_flag_invalid STATUS_VAR);
2836
            return float64_default_nan;
2837
        }
2838
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2839
    }
2840
    if ( aExp == 0 ) {
2841
        if ( aSig == 0 ) return a;
2842
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2843
    }
2844
    expDiff = aExp - bExp;
2845
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2846
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2847
    if ( expDiff < 0 ) {
2848
        if ( expDiff < -1 ) return a;
2849
        aSig >>= 1;
2850
    }
2851
    q = ( bSig <= aSig );
2852
    if ( q ) aSig -= bSig;
2853
    expDiff -= 64;
2854
    while ( 0 < expDiff ) {
2855
        q = estimateDiv128To64( aSig, 0, bSig );
2856
        q = ( 2 < q ) ? q - 2 : 0;
2857
        aSig = - ( ( bSig>>2 ) * q );
2858
        expDiff -= 62;
2859
    }
2860
    expDiff += 64;
2861
    if ( 0 < expDiff ) {
2862
        q = estimateDiv128To64( aSig, 0, bSig );
2863
        q = ( 2 < q ) ? q - 2 : 0;
2864
        q >>= 64 - expDiff;
2865
        bSig >>= 2;
2866
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2867
    }
2868
    else {
2869
        aSig >>= 2;
2870
        bSig >>= 2;
2871
    }
2872
    do {
2873
        alternateASig = aSig;
2874
        ++q;
2875
        aSig -= bSig;
2876
    } while ( 0 <= (sbits64) aSig );
2877
    sigMean = aSig + alternateASig;
2878
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2879
        aSig = alternateASig;
2880
    }
2881
    zSign = ( (sbits64) aSig < 0 );
2882
    if ( zSign ) aSig = - aSig;
2883
    return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
2884

    
2885
}
2886

    
2887
/*----------------------------------------------------------------------------
2888
| Returns the square root of the double-precision floating-point value `a'.
2889
| The operation is performed according to the IEC/IEEE Standard for Binary
2890
| Floating-Point Arithmetic.
2891
*----------------------------------------------------------------------------*/
2892

    
2893
float64 float64_sqrt( float64 a STATUS_PARAM )
2894
{
2895
    flag aSign;
2896
    int16 aExp, zExp;
2897
    bits64 aSig, zSig, doubleZSig;
2898
    bits64 rem0, rem1, term0, term1;
2899

    
2900
    aSig = extractFloat64Frac( a );
2901
    aExp = extractFloat64Exp( a );
2902
    aSign = extractFloat64Sign( a );
2903
    if ( aExp == 0x7FF ) {
2904
        if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
2905
        if ( ! aSign ) return a;
2906
        float_raise( float_flag_invalid STATUS_VAR);
2907
        return float64_default_nan;
2908
    }
2909
    if ( aSign ) {
2910
        if ( ( aExp | aSig ) == 0 ) return a;
2911
        float_raise( float_flag_invalid STATUS_VAR);
2912
        return float64_default_nan;
2913
    }
2914
    if ( aExp == 0 ) {
2915
        if ( aSig == 0 ) return 0;
2916
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2917
    }
2918
    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2919
    aSig |= LIT64( 0x0010000000000000 );
2920
    zSig = estimateSqrt32( aExp, aSig>>21 );
2921
    aSig <<= 9 - ( aExp & 1 );
2922
    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
2923
    if ( ( zSig & 0x1FF ) <= 5 ) {
2924
        doubleZSig = zSig<<1;
2925
        mul64To128( zSig, zSig, &term0, &term1 );
2926
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2927
        while ( (sbits64) rem0 < 0 ) {
2928
            --zSig;
2929
            doubleZSig -= 2;
2930
            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
2931
        }
2932
        zSig |= ( ( rem0 | rem1 ) != 0 );
2933
    }
2934
    return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
2935

    
2936
}
2937

    
2938
/*----------------------------------------------------------------------------
2939
| Returns 1 if the double-precision floating-point value `a' is equal to the
2940
| corresponding value `b', and 0 otherwise.  The comparison is performed
2941
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2942
*----------------------------------------------------------------------------*/
2943

    
2944
flag float64_eq( float64 a, float64 b STATUS_PARAM )
2945
{
2946

    
2947
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2948
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2949
       ) {
2950
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2951
            float_raise( float_flag_invalid STATUS_VAR);
2952
        }
2953
        return 0;
2954
    }
2955
    return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2956

    
2957
}
2958

    
2959
/*----------------------------------------------------------------------------
2960
| Returns 1 if the double-precision floating-point value `a' is less than or
2961
| equal to the corresponding value `b', and 0 otherwise.  The comparison is
2962
| performed according to the IEC/IEEE Standard for Binary Floating-Point
2963
| Arithmetic.
2964
*----------------------------------------------------------------------------*/
2965

    
2966
flag float64_le( float64 a, float64 b STATUS_PARAM )
2967
{
2968
    flag aSign, bSign;
2969

    
2970
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2971
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2972
       ) {
2973
        float_raise( float_flag_invalid STATUS_VAR);
2974
        return 0;
2975
    }
2976
    aSign = extractFloat64Sign( a );
2977
    bSign = extractFloat64Sign( b );
2978
    if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2979
    return ( a == b ) || ( aSign ^ ( a < b ) );
2980

    
2981
}
2982

    
2983
/*----------------------------------------------------------------------------
2984
| Returns 1 if the double-precision floating-point value `a' is less than
2985
| the corresponding value `b', and 0 otherwise.  The comparison is performed
2986
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2987
*----------------------------------------------------------------------------*/
2988

    
2989
flag float64_lt( float64 a, float64 b STATUS_PARAM )
2990
{
2991
    flag aSign, bSign;
2992

    
2993
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2994
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2995
       ) {
2996
        float_raise( float_flag_invalid STATUS_VAR);
2997
        return 0;
2998
    }
2999
    aSign = extractFloat64Sign( a );
3000
    bSign = extractFloat64Sign( b );
3001
    if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3002
    return ( a != b ) && ( aSign ^ ( a < b ) );
3003

    
3004
}
3005

    
3006
/*----------------------------------------------------------------------------
3007
| Returns 1 if the double-precision floating-point value `a' is equal to the
3008
| corresponding value `b', and 0 otherwise.  The invalid exception is raised
3009
| if either operand is a NaN.  Otherwise, the comparison is performed
3010
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3011
*----------------------------------------------------------------------------*/
3012

    
3013
flag float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3014
{
3015

    
3016
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3017
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3018
       ) {
3019
        float_raise( float_flag_invalid STATUS_VAR);
3020
        return 0;
3021
    }
3022
    return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3023

    
3024
}
3025

    
3026
/*----------------------------------------------------------------------------
3027
| Returns 1 if the double-precision floating-point value `a' is less than or
3028
| equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3029
| cause an exception.  Otherwise, the comparison is performed according to the
3030
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3031
*----------------------------------------------------------------------------*/
3032

    
3033
flag float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3034
{
3035
    flag aSign, bSign;
3036

    
3037
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3038
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3039
       ) {
3040
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3041
            float_raise( float_flag_invalid STATUS_VAR);
3042
        }
3043
        return 0;
3044
    }
3045
    aSign = extractFloat64Sign( a );
3046
    bSign = extractFloat64Sign( b );
3047
    if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3048
    return ( a == b ) || ( aSign ^ ( a < b ) );
3049

    
3050
}
3051

    
3052
/*----------------------------------------------------------------------------
3053
| Returns 1 if the double-precision floating-point value `a' is less than
3054
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3055
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3056
| Standard for Binary Floating-Point Arithmetic.
3057
*----------------------------------------------------------------------------*/
3058

    
3059
flag float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3060
{
3061
    flag aSign, bSign;
3062

    
3063
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3064
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3065
       ) {
3066
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3067
            float_raise( float_flag_invalid STATUS_VAR);
3068
        }
3069
        return 0;
3070
    }
3071
    aSign = extractFloat64Sign( a );
3072
    bSign = extractFloat64Sign( b );
3073
    if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3074
    return ( a != b ) && ( aSign ^ ( a < b ) );
3075

    
3076
}
3077

    
3078
#ifdef FLOATX80
3079

    
3080
/*----------------------------------------------------------------------------
3081
| Returns the result of converting the extended double-precision floating-
3082
| point value `a' to the 32-bit two's complement integer format.  The
3083
| conversion is performed according to the IEC/IEEE Standard for Binary
3084
| Floating-Point Arithmetic---which means in particular that the conversion
3085
| is rounded according to the current rounding mode.  If `a' is a NaN, the
3086
| largest positive integer is returned.  Otherwise, if the conversion
3087
| overflows, the largest integer with the same sign as `a' is returned.
3088
*----------------------------------------------------------------------------*/
3089

    
3090
int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3091
{
3092
    flag aSign;
3093
    int32 aExp, shiftCount;
3094
    bits64 aSig;
3095

    
3096
    aSig = extractFloatx80Frac( a );
3097
    aExp = extractFloatx80Exp( a );
3098
    aSign = extractFloatx80Sign( a );
3099
    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3100
    shiftCount = 0x4037 - aExp;
3101
    if ( shiftCount <= 0 ) shiftCount = 1;
3102
    shift64RightJamming( aSig, shiftCount, &aSig );
3103
    return roundAndPackInt32( aSign, aSig STATUS_VAR );
3104

    
3105
}
3106

    
3107
/*----------------------------------------------------------------------------
3108
| Returns the result of converting the extended double-precision floating-
3109
| point value `a' to the 32-bit two's complement integer format.  The
3110
| conversion is performed according to the IEC/IEEE Standard for Binary
3111
| Floating-Point Arithmetic, except that the conversion is always rounded
3112
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3113
| Otherwise, if the conversion overflows, the largest integer with the same
3114
| sign as `a' is returned.
3115
*----------------------------------------------------------------------------*/
3116

    
3117
int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3118
{
3119
    flag aSign;
3120
    int32 aExp, shiftCount;
3121
    bits64 aSig, savedASig;
3122
    int32 z;
3123

    
3124
    aSig = extractFloatx80Frac( a );
3125
    aExp = extractFloatx80Exp( a );
3126
    aSign = extractFloatx80Sign( a );
3127
    if ( 0x401E < aExp ) {
3128
        if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3129
        goto invalid;
3130
    }
3131
    else if ( aExp < 0x3FFF ) {
3132
        if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3133
        return 0;
3134
    }
3135
    shiftCount = 0x403E - aExp;
3136
    savedASig = aSig;
3137
    aSig >>= shiftCount;
3138
    z = aSig;
3139
    if ( aSign ) z = - z;
3140
    if ( ( z < 0 ) ^ aSign ) {
3141
 invalid:
3142
        float_raise( float_flag_invalid STATUS_VAR);
3143
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3144
    }
3145
    if ( ( aSig<<shiftCount ) != savedASig ) {
3146
        STATUS(float_exception_flags) |= float_flag_inexact;
3147
    }
3148
    return z;
3149

    
3150
}
3151

    
3152
/*----------------------------------------------------------------------------
3153
| Returns the result of converting the extended double-precision floating-
3154
| point value `a' to the 64-bit two's complement integer format.  The
3155
| conversion is performed according to the IEC/IEEE Standard for Binary
3156
| Floating-Point Arithmetic---which means in particular that the conversion
3157
| is rounded according to the current rounding mode.  If `a' is a NaN,
3158
| the largest positive integer is returned.  Otherwise, if the conversion
3159
| overflows, the largest integer with the same sign as `a' is returned.
3160
*----------------------------------------------------------------------------*/
3161

    
3162
int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3163
{
3164
    flag aSign;
3165
    int32 aExp, shiftCount;
3166
    bits64 aSig, aSigExtra;
3167

    
3168
    aSig = extractFloatx80Frac( a );
3169
    aExp = extractFloatx80Exp( a );
3170
    aSign = extractFloatx80Sign( a );
3171
    shiftCount = 0x403E - aExp;
3172
    if ( shiftCount <= 0 ) {
3173
        if ( shiftCount ) {
3174
            float_raise( float_flag_invalid STATUS_VAR);
3175
            if (    ! aSign
3176
                 || (    ( aExp == 0x7FFF )
3177
                      && ( aSig != LIT64( 0x8000000000000000 ) ) )
3178
               ) {
3179
                return LIT64( 0x7FFFFFFFFFFFFFFF );
3180
            }
3181
            return (sbits64) LIT64( 0x8000000000000000 );
3182
        }
3183
        aSigExtra = 0;
3184
    }
3185
    else {
3186
        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3187
    }
3188
    return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3189

    
3190
}
3191

    
3192
/*----------------------------------------------------------------------------
3193
| Returns the result of converting the extended double-precision floating-
3194
| point value `a' to the 64-bit two's complement integer format.  The
3195
| conversion is performed according to the IEC/IEEE Standard for Binary
3196
| Floating-Point Arithmetic, except that the conversion is always rounded
3197
| toward zero.  If `a' is a NaN, the largest positive integer is returned.
3198
| Otherwise, if the conversion overflows, the largest integer with the same
3199
| sign as `a' is returned.
3200
*----------------------------------------------------------------------------*/
3201

    
3202
int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3203
{
3204
    flag aSign;
3205
    int32 aExp, shiftCount;
3206
    bits64 aSig;
3207
    int64 z;
3208

    
3209
    aSig = extractFloatx80Frac( a );
3210
    aExp = extractFloatx80Exp( a );
3211
    aSign = extractFloatx80Sign( a );
3212
    shiftCount = aExp - 0x403E;
3213
    if ( 0 <= shiftCount ) {
3214
        aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3215
        if ( ( a.high != 0xC03E ) || aSig ) {
3216
            float_raise( float_flag_invalid STATUS_VAR);
3217
            if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3218
                return LIT64( 0x7FFFFFFFFFFFFFFF );
3219
            }
3220
        }
3221
        return (sbits64) LIT64( 0x8000000000000000 );
3222
    }
3223
    else if ( aExp < 0x3FFF ) {
3224
        if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3225
        return 0;
3226
    }
3227
    z = aSig>>( - shiftCount );
3228
    if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3229
        STATUS(float_exception_flags) |= float_flag_inexact;
3230
    }
3231
    if ( aSign ) z = - z;
3232
    return z;
3233

    
3234
}
3235

    
3236
/*----------------------------------------------------------------------------
3237
| Returns the result of converting the extended double-precision floating-
3238
| point value `a' to the single-precision floating-point format.  The
3239
| conversion is performed according to the IEC/IEEE Standard for Binary
3240
| Floating-Point Arithmetic.
3241
*----------------------------------------------------------------------------*/
3242

    
3243
float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3244
{
3245
    flag aSign;
3246
    int32 aExp;
3247
    bits64 aSig;
3248

    
3249
    aSig = extractFloatx80Frac( a );
3250
    aExp = extractFloatx80Exp( a );
3251
    aSign = extractFloatx80Sign( a );
3252
    if ( aExp == 0x7FFF ) {
3253
        if ( (bits64) ( aSig<<1 ) ) {
3254
            return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3255
        }
3256
        return packFloat32( aSign, 0xFF, 0 );
3257
    }
3258
    shift64RightJamming( aSig, 33, &aSig );
3259
    if ( aExp || aSig ) aExp -= 0x3F81;
3260
    return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3261

    
3262
}
3263

    
3264
/*----------------------------------------------------------------------------
3265
| Returns the result of converting the extended double-precision floating-
3266
| point value `a' to the double-precision floating-point format.  The
3267
| conversion is performed according to the IEC/IEEE Standard for Binary
3268
| Floating-Point Arithmetic.
3269
*----------------------------------------------------------------------------*/
3270

    
3271
float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3272
{
3273
    flag aSign;
3274
    int32 aExp;
3275
    bits64 aSig, zSig;
3276

    
3277
    aSig = extractFloatx80Frac( a );
3278
    aExp = extractFloatx80Exp( a );
3279
    aSign = extractFloatx80Sign( a );
3280
    if ( aExp == 0x7FFF ) {
3281
        if ( (bits64) ( aSig<<1 ) ) {
3282
            return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3283
        }
3284
        return packFloat64( aSign, 0x7FF, 0 );
3285
    }
3286
    shift64RightJamming( aSig, 1, &zSig );
3287
    if ( aExp || aSig ) aExp -= 0x3C01;
3288
    return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3289

    
3290
}
3291

    
3292
#ifdef FLOAT128
3293

    
3294
/*----------------------------------------------------------------------------
3295
| Returns the result of converting the extended double-precision floating-
3296
| point value `a' to the quadruple-precision floating-point format.  The
3297
| conversion is performed according to the IEC/IEEE Standard for Binary
3298
| Floating-Point Arithmetic.
3299
*----------------------------------------------------------------------------*/
3300

    
3301
float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3302
{
3303
    flag aSign;
3304
    int16 aExp;
3305
    bits64 aSig, zSig0, zSig1;
3306

    
3307
    aSig = extractFloatx80Frac( a );
3308
    aExp = extractFloatx80Exp( a );
3309
    aSign = extractFloatx80Sign( a );
3310
    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3311
        return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3312
    }
3313
    shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3314
    return packFloat128( aSign, aExp, zSig0, zSig1 );
3315

    
3316
}
3317

    
3318
#endif
3319

    
3320
/*----------------------------------------------------------------------------
3321
| Rounds the extended double-precision floating-point value `a' to an integer,
3322
| and returns the result as an extended quadruple-precision floating-point
3323
| value.  The operation is performed according to the IEC/IEEE Standard for
3324
| Binary Floating-Point Arithmetic.
3325
*----------------------------------------------------------------------------*/
3326

    
3327
floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3328
{
3329
    flag aSign;
3330
    int32 aExp;
3331
    bits64 lastBitMask, roundBitsMask;
3332
    int8 roundingMode;
3333
    floatx80 z;
3334

    
3335
    aExp = extractFloatx80Exp( a );
3336
    if ( 0x403E <= aExp ) {
3337
        if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3338
            return propagateFloatx80NaN( a, a STATUS_VAR );
3339
        }
3340
        return a;
3341
    }
3342
    if ( aExp < 0x3FFF ) {
3343
        if (    ( aExp == 0 )
3344
             && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3345
            return a;
3346
        }
3347
        STATUS(float_exception_flags) |= float_flag_inexact;
3348
        aSign = extractFloatx80Sign( a );
3349
        switch ( STATUS(float_rounding_mode) ) {
3350
         case float_round_nearest_even:
3351
            if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3352
               ) {
3353
                return
3354
                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3355
            }
3356
            break;
3357
         case float_round_down:
3358
            return
3359
                  aSign ?
3360
                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3361
                : packFloatx80( 0, 0, 0 );
3362
         case float_round_up:
3363
            return
3364
                  aSign ? packFloatx80( 1, 0, 0 )
3365
                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3366
        }
3367
        return packFloatx80( aSign, 0, 0 );
3368
    }
3369
    lastBitMask = 1;
3370
    lastBitMask <<= 0x403E - aExp;
3371
    roundBitsMask = lastBitMask - 1;
3372
    z = a;
3373
    roundingMode = STATUS(float_rounding_mode);
3374
    if ( roundingMode == float_round_nearest_even ) {
3375
        z.low += lastBitMask>>1;
3376
        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3377
    }
3378
    else if ( roundingMode != float_round_to_zero ) {
3379
        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3380
            z.low += roundBitsMask;
3381
        }
3382
    }
3383
    z.low &= ~ roundBitsMask;
3384
    if ( z.low == 0 ) {
3385
        ++z.high;
3386
        z.low = LIT64( 0x8000000000000000 );
3387
    }
3388
    if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
3389
    return z;
3390

    
3391
}
3392

    
3393
/*----------------------------------------------------------------------------
3394
| Returns the result of adding the absolute values of the extended double-
3395
| precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3396
| negated before being returned.  `zSign' is ignored if the result is a NaN.
3397
| The addition is performed according to the IEC/IEEE Standard for Binary
3398
| Floating-Point Arithmetic.
3399
*----------------------------------------------------------------------------*/
3400

    
3401
static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3402
{
3403
    int32 aExp, bExp, zExp;
3404
    bits64 aSig, bSig, zSig0, zSig1;
3405
    int32 expDiff;
3406

    
3407
    aSig = extractFloatx80Frac( a );
3408
    aExp = extractFloatx80Exp( a );
3409
    bSig = extractFloatx80Frac( b );
3410
    bExp = extractFloatx80Exp( b );
3411
    expDiff = aExp - bExp;
3412
    if ( 0 < expDiff ) {
3413
        if ( aExp == 0x7FFF ) {
3414
            if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3415
            return a;
3416
        }
3417
        if ( bExp == 0 ) --expDiff;
3418
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3419
        zExp = aExp;
3420
    }
3421
    else if ( expDiff < 0 ) {
3422
        if ( bExp == 0x7FFF ) {
3423
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3424
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3425
        }
3426
        if ( aExp == 0 ) ++expDiff;
3427
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3428
        zExp = bExp;
3429
    }
3430
    else {
3431
        if ( aExp == 0x7FFF ) {
3432
            if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3433
                return propagateFloatx80NaN( a, b STATUS_VAR );
3434
            }
3435
            return a;
3436
        }
3437
        zSig1 = 0;
3438
        zSig0 = aSig + bSig;
3439
        if ( aExp == 0 ) {
3440
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3441
            goto roundAndPack;
3442
        }
3443
        zExp = aExp;
3444
        goto shiftRight1;
3445
    }
3446
    zSig0 = aSig + bSig;
3447
    if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3448
 shiftRight1:
3449
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3450
    zSig0 |= LIT64( 0x8000000000000000 );
3451
    ++zExp;
3452
 roundAndPack:
3453
    return
3454
        roundAndPackFloatx80(
3455
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3456

    
3457
}
3458

    
3459
/*----------------------------------------------------------------------------
3460
| Returns the result of subtracting the absolute values of the extended
3461
| double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3462
| difference is negated before being returned.  `zSign' is ignored if the
3463
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
3464
| Standard for Binary Floating-Point Arithmetic.
3465
*----------------------------------------------------------------------------*/
3466

    
3467
static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
3468
{
3469
    int32 aExp, bExp, zExp;
3470
    bits64 aSig, bSig, zSig0, zSig1;
3471
    int32 expDiff;
3472
    floatx80 z;
3473

    
3474
    aSig = extractFloatx80Frac( a );
3475
    aExp = extractFloatx80Exp( a );
3476
    bSig = extractFloatx80Frac( b );
3477
    bExp = extractFloatx80Exp( b );
3478
    expDiff = aExp - bExp;
3479
    if ( 0 < expDiff ) goto aExpBigger;
3480
    if ( expDiff < 0 ) goto bExpBigger;
3481
    if ( aExp == 0x7FFF ) {
3482
        if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3483
            return propagateFloatx80NaN( a, b STATUS_VAR );
3484
        }
3485
        float_raise( float_flag_invalid STATUS_VAR);
3486
        z.low = floatx80_default_nan_low;
3487
        z.high = floatx80_default_nan_high;
3488
        return z;
3489
    }
3490
    if ( aExp == 0 ) {
3491
        aExp = 1;
3492
        bExp = 1;
3493
    }
3494
    zSig1 = 0;
3495
    if ( bSig < aSig ) goto aBigger;
3496
    if ( aSig < bSig ) goto bBigger;
3497
    return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3498
 bExpBigger:
3499
    if ( bExp == 0x7FFF ) {
3500
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3501
        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3502
    }
3503
    if ( aExp == 0 ) ++expDiff;
3504
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3505
 bBigger:
3506
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3507
    zExp = bExp;
3508
    zSign ^= 1;
3509
    goto normalizeRoundAndPack;
3510
 aExpBigger:
3511
    if ( aExp == 0x7FFF ) {
3512
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3513
        return a;
3514
    }
3515
    if ( bExp == 0 ) --expDiff;
3516
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3517
 aBigger:
3518
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3519
    zExp = aExp;
3520
 normalizeRoundAndPack:
3521
    return
3522
        normalizeRoundAndPackFloatx80(
3523
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3524

    
3525
}
3526

    
3527
/*----------------------------------------------------------------------------
3528
| Returns the result of adding the extended double-precision floating-point
3529
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
3530
| Standard for Binary Floating-Point Arithmetic.
3531
*----------------------------------------------------------------------------*/
3532

    
3533
floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
3534
{
3535
    flag aSign, bSign;
3536

    
3537
    aSign = extractFloatx80Sign( a );
3538
    bSign = extractFloatx80Sign( b );
3539
    if ( aSign == bSign ) {
3540
        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3541
    }
3542
    else {
3543
        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3544
    }
3545

    
3546
}
3547

    
3548
/*----------------------------------------------------------------------------
3549
| Returns the result of subtracting the extended double-precision floating-
3550
| point values `a' and `b'.  The operation is performed according to the
3551
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3552
*----------------------------------------------------------------------------*/
3553

    
3554
floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
3555
{
3556
    flag aSign, bSign;
3557

    
3558
    aSign = extractFloatx80Sign( a );
3559
    bSign = extractFloatx80Sign( b );
3560
    if ( aSign == bSign ) {
3561
        return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3562
    }
3563
    else {
3564
        return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3565
    }
3566

    
3567
}
3568

    
3569
/*----------------------------------------------------------------------------
3570
| Returns the result of multiplying the extended double-precision floating-
3571
| point values `a' and `b'.  The operation is performed according to the
3572
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3573
*----------------------------------------------------------------------------*/
3574

    
3575
floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
3576
{
3577
    flag aSign, bSign, zSign;
3578
    int32 aExp, bExp, zExp;
3579
    bits64 aSig, bSig, zSig0, zSig1;
3580
    floatx80 z;
3581

    
3582
    aSig = extractFloatx80Frac( a );
3583
    aExp = extractFloatx80Exp( a );
3584
    aSign = extractFloatx80Sign( a );
3585
    bSig = extractFloatx80Frac( b );
3586
    bExp = extractFloatx80Exp( b );
3587
    bSign = extractFloatx80Sign( b );
3588
    zSign = aSign ^ bSign;
3589
    if ( aExp == 0x7FFF ) {
3590
        if (    (bits64) ( aSig<<1 )
3591
             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3592
            return propagateFloatx80NaN( a, b STATUS_VAR );
3593
        }
3594
        if ( ( bExp | bSig ) == 0 ) goto invalid;
3595
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3596
    }
3597
    if ( bExp == 0x7FFF ) {
3598
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3599
        if ( ( aExp | aSig ) == 0 ) {
3600
 invalid:
3601
            float_raise( float_flag_invalid STATUS_VAR);
3602
            z.low = floatx80_default_nan_low;
3603
            z.high = floatx80_default_nan_high;
3604
            return z;
3605
        }
3606
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3607
    }
3608
    if ( aExp == 0 ) {
3609
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3610
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3611
    }
3612
    if ( bExp == 0 ) {
3613
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3614
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3615
    }
3616
    zExp = aExp + bExp - 0x3FFE;
3617
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
3618
    if ( 0 < (sbits64) zSig0 ) {
3619
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3620
        --zExp;
3621
    }
3622
    return
3623
        roundAndPackFloatx80(
3624
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3625

    
3626
}
3627

    
3628
/*----------------------------------------------------------------------------
3629
| Returns the result of dividing the extended double-precision floating-point
3630
| value `a' by the corresponding value `b'.  The operation is performed
3631
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3632
*----------------------------------------------------------------------------*/
3633

    
3634
floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
3635
{
3636
    flag aSign, bSign, zSign;
3637
    int32 aExp, bExp, zExp;
3638
    bits64 aSig, bSig, zSig0, zSig1;
3639
    bits64 rem0, rem1, rem2, term0, term1, term2;
3640
    floatx80 z;
3641

    
3642
    aSig = extractFloatx80Frac( a );
3643
    aExp = extractFloatx80Exp( a );
3644
    aSign = extractFloatx80Sign( a );
3645
    bSig = extractFloatx80Frac( b );
3646
    bExp = extractFloatx80Exp( b );
3647
    bSign = extractFloatx80Sign( b );
3648
    zSign = aSign ^ bSign;
3649
    if ( aExp == 0x7FFF ) {
3650
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3651
        if ( bExp == 0x7FFF ) {
3652
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3653
            goto invalid;
3654
        }
3655
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3656
    }
3657
    if ( bExp == 0x7FFF ) {
3658
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3659
        return packFloatx80( zSign, 0, 0 );
3660
    }
3661
    if ( bExp == 0 ) {
3662
        if ( bSig == 0 ) {
3663
            if ( ( aExp | aSig ) == 0 ) {
3664
 invalid:
3665
                float_raise( float_flag_invalid STATUS_VAR);
3666
                z.low = floatx80_default_nan_low;
3667
                z.high = floatx80_default_nan_high;
3668
                return z;
3669
            }
3670
            float_raise( float_flag_divbyzero STATUS_VAR);
3671
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3672
        }
3673
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3674
    }
3675
    if ( aExp == 0 ) {
3676
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3677
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3678
    }
3679
    zExp = aExp - bExp + 0x3FFE;
3680
    rem1 = 0;
3681
    if ( bSig <= aSig ) {
3682
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
3683
        ++zExp;
3684
    }
3685
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3686
    mul64To128( bSig, zSig0, &term0, &term1 );
3687
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3688
    while ( (sbits64) rem0 < 0 ) {
3689
        --zSig0;
3690
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3691
    }
3692
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
3693
    if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3694
        mul64To128( bSig, zSig1, &term1, &term2 );
3695
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3696
        while ( (sbits64) rem1 < 0 ) {
3697
            --zSig1;
3698
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3699
        }
3700
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
3701
    }
3702
    return
3703
        roundAndPackFloatx80(
3704
            STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3705

    
3706
}
3707

    
3708
/*----------------------------------------------------------------------------
3709
| Returns the remainder of the extended double-precision floating-point value
3710
| `a' with respect to the corresponding value `b'.  The operation is performed
3711
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3712
*----------------------------------------------------------------------------*/
3713

    
3714
floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
3715
{
3716
    flag aSign, bSign, zSign;
3717
    int32 aExp, bExp, expDiff;
3718
    bits64 aSig0, aSig1, bSig;
3719
    bits64 q, term0, term1, alternateASig0, alternateASig1;
3720
    floatx80 z;
3721

    
3722
    aSig0 = extractFloatx80Frac( a );
3723
    aExp = extractFloatx80Exp( a );
3724
    aSign = extractFloatx80Sign( a );
3725
    bSig = extractFloatx80Frac( b );
3726
    bExp = extractFloatx80Exp( b );
3727
    bSign = extractFloatx80Sign( b );
3728
    if ( aExp == 0x7FFF ) {
3729
        if (    (bits64) ( aSig0<<1 )
3730
             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3731
            return propagateFloatx80NaN( a, b STATUS_VAR );
3732
        }
3733
        goto invalid;
3734
    }
3735
    if ( bExp == 0x7FFF ) {
3736
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3737
        return a;
3738
    }
3739
    if ( bExp == 0 ) {
3740
        if ( bSig == 0 ) {
3741
 invalid:
3742
            float_raise( float_flag_invalid STATUS_VAR);
3743
            z.low = floatx80_default_nan_low;
3744
            z.high = floatx80_default_nan_high;
3745
            return z;
3746
        }
3747
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3748
    }
3749
    if ( aExp == 0 ) {
3750
        if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3751
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3752
    }
3753
    bSig |= LIT64( 0x8000000000000000 );
3754
    zSign = aSign;
3755
    expDiff = aExp - bExp;
3756
    aSig1 = 0;
3757
    if ( expDiff < 0 ) {
3758
        if ( expDiff < -1 ) return a;
3759
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3760
        expDiff = 0;
3761
    }
3762
    q = ( bSig <= aSig0 );
3763
    if ( q ) aSig0 -= bSig;
3764
    expDiff -= 64;
3765
    while ( 0 < expDiff ) {
3766
        q = estimateDiv128To64( aSig0, aSig1, bSig );
3767
        q = ( 2 < q ) ? q - 2 : 0;
3768
        mul64To128( bSig, q, &term0, &term1 );
3769
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3770
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3771
        expDiff -= 62;
3772
    }
3773
    expDiff += 64;
3774
    if ( 0 < expDiff ) {
3775
        q = estimateDiv128To64( aSig0, aSig1, bSig );
3776
        q = ( 2 < q ) ? q - 2 : 0;
3777
        q >>= 64 - expDiff;
3778
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3779
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3780
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3781
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
3782
            ++q;
3783
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3784
        }
3785
    }
3786
    else {
3787
        term1 = 0;
3788
        term0 = bSig;
3789
    }
3790
    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3791
    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3792
         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3793
              && ( q & 1 ) )
3794
       ) {
3795
        aSig0 = alternateASig0;
3796
        aSig1 = alternateASig1;
3797
        zSign = ! zSign;
3798
    }
3799
    return
3800
        normalizeRoundAndPackFloatx80(
3801
            80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
3802

    
3803
}
3804

    
3805
/*----------------------------------------------------------------------------
3806
| Returns the square root of the extended double-precision floating-point
3807
| value `a'.  The operation is performed according to the IEC/IEEE Standard
3808
| for Binary Floating-Point Arithmetic.
3809
*----------------------------------------------------------------------------*/
3810

    
3811
floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
3812
{
3813
    flag aSign;
3814
    int32 aExp, zExp;
3815
    bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3816
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3817
    floatx80 z;
3818

    
3819
    aSig0 = extractFloatx80Frac( a );
3820
    aExp = extractFloatx80Exp( a );
3821
    aSign = extractFloatx80Sign( a );
3822
    if ( aExp == 0x7FFF ) {
3823
        if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
3824
        if ( ! aSign ) return a;
3825
        goto invalid;
3826
    }
3827
    if ( aSign ) {
3828
        if ( ( aExp | aSig0 ) == 0 ) return a;
3829
 invalid:
3830
        float_raise( float_flag_invalid STATUS_VAR);
3831
        z.low = floatx80_default_nan_low;
3832
        z.high = floatx80_default_nan_high;
3833
        return z;
3834
    }
3835
    if ( aExp == 0 ) {
3836
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3837
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3838
    }
3839
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3840
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3841
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
3842
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
3843
    doubleZSig0 = zSig0<<1;
3844
    mul64To128( zSig0, zSig0, &term0, &term1 );
3845
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3846
    while ( (sbits64) rem0 < 0 ) {
3847
        --zSig0;
3848
        doubleZSig0 -= 2;
3849
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
3850
    }
3851
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
3852
    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3853
        if ( zSig1 == 0 ) zSig1 = 1;
3854
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
3855
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3856
        mul64To128( zSig1, zSig1, &term2, &term3 );
3857
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3858
        while ( (sbits64) rem1 < 0 ) {
3859
            --zSig1;
3860
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
3861
            term3 |= 1;
3862
            term2 |= doubleZSig0;
3863
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
3864
        }
3865
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3866
    }
3867
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
3868
    zSig0 |= doubleZSig0;
3869
    return
3870
        roundAndPackFloatx80(
3871
            STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
3872

    
3873
}
3874

    
3875
/*----------------------------------------------------------------------------
3876
| Returns 1 if the extended double-precision floating-point value `a' is
3877
| equal to the corresponding value `b', and 0 otherwise.  The comparison is
3878
| performed according to the IEC/IEEE Standard for Binary Floating-Point
3879
| Arithmetic.
3880
*----------------------------------------------------------------------------*/
3881

    
3882
flag floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
3883
{
3884

    
3885
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3886
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3887
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3888
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3889
       ) {
3890
        if (    floatx80_is_signaling_nan( a )
3891
             || floatx80_is_signaling_nan( b ) ) {
3892
            float_raise( float_flag_invalid STATUS_VAR);
3893
        }
3894
        return 0;
3895
    }
3896
    return
3897
           ( a.low == b.low )
3898
        && (    ( a.high == b.high )
3899
             || (    ( a.low == 0 )
3900
                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3901
           );
3902

    
3903
}
3904

    
3905
/*----------------------------------------------------------------------------
3906
| Returns 1 if the extended double-precision floating-point value `a' is
3907
| less than or equal to the corresponding value `b', and 0 otherwise.  The
3908
| comparison is performed according to the IEC/IEEE Standard for Binary
3909
| Floating-Point Arithmetic.
3910
*----------------------------------------------------------------------------*/
3911

    
3912
flag floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
3913
{
3914
    flag aSign, bSign;
3915

    
3916
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3917
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3918
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3919
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3920
       ) {
3921
        float_raise( float_flag_invalid STATUS_VAR);
3922
        return 0;
3923
    }
3924
    aSign = extractFloatx80Sign( a );
3925
    bSign = extractFloatx80Sign( b );
3926
    if ( aSign != bSign ) {
3927
        return
3928
               aSign
3929
            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3930
                 == 0 );
3931
    }
3932
    return
3933
          aSign ? le128( b.high, b.low, a.high, a.low )
3934
        : le128( a.high, a.low, b.high, b.low );
3935

    
3936
}
3937

    
3938
/*----------------------------------------------------------------------------
3939
| Returns 1 if the extended double-precision floating-point value `a' is
3940
| less than the corresponding value `b', and 0 otherwise.  The comparison
3941
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3942
| Arithmetic.
3943
*----------------------------------------------------------------------------*/
3944

    
3945
flag floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
3946
{
3947
    flag aSign, bSign;
3948

    
3949
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3950
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3951
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3952
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3953
       ) {
3954
        float_raise( float_flag_invalid STATUS_VAR);
3955
        return 0;
3956
    }
3957
    aSign = extractFloatx80Sign( a );
3958
    bSign = extractFloatx80Sign( b );
3959
    if ( aSign != bSign ) {
3960
        return
3961
               aSign
3962
            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3963
                 != 0 );
3964
    }
3965
    return
3966
          aSign ? lt128( b.high, b.low, a.high, a.low )
3967
        : lt128( a.high, a.low, b.high, b.low );
3968

    
3969
}
3970

    
3971
/*----------------------------------------------------------------------------
3972
| Returns 1 if the extended double-precision floating-point value `a' is equal
3973
| to the corresponding value `b', and 0 otherwise.  The invalid exception is
3974
| raised if either operand is a NaN.  Otherwise, the comparison is performed
3975
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3976
*----------------------------------------------------------------------------*/
3977

    
3978
flag floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
3979
{
3980

    
3981
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3982
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3983
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3984
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3985
       ) {
3986
        float_raise( float_flag_invalid STATUS_VAR);
3987
        return 0;
3988
    }
3989
    return
3990
           ( a.low == b.low )
3991
        && (    ( a.high == b.high )
3992
             || (    ( a.low == 0 )
3993
                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3994
           );
3995

    
3996
}
3997

    
3998
/*----------------------------------------------------------------------------
3999
| Returns 1 if the extended double-precision floating-point value `a' is less
4000
| than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4001
| do not cause an exception.  Otherwise, the comparison is performed according
4002
| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4003
*----------------------------------------------------------------------------*/
4004

    
4005
flag floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4006
{
4007
    flag aSign, bSign;
4008

    
4009
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4010
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4011
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4012
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4013
       ) {
4014
        if (    floatx80_is_signaling_nan( a )
4015
             || floatx80_is_signaling_nan( b ) ) {
4016
            float_raise( float_flag_invalid STATUS_VAR);
4017
        }
4018
        return 0;
4019
    }
4020
    aSign = extractFloatx80Sign( a );
4021
    bSign = extractFloatx80Sign( b );
4022
    if ( aSign != bSign ) {
4023
        return
4024
               aSign
4025
            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4026
                 == 0 );
4027
    }
4028
    return
4029
          aSign ? le128( b.high, b.low, a.high, a.low )
4030
        : le128( a.high, a.low, b.high, b.low );
4031

    
4032
}
4033

    
4034
/*----------------------------------------------------------------------------
4035
| Returns 1 if the extended double-precision floating-point value `a' is less
4036
| than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4037
| an exception.  Otherwise, the comparison is performed according to the
4038
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4039
*----------------------------------------------------------------------------*/
4040

    
4041
flag floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4042
{
4043
    flag aSign, bSign;
4044

    
4045
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4046
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4047
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4048
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4049
       ) {
4050
        if (    floatx80_is_signaling_nan( a )
4051
             || floatx80_is_signaling_nan( b ) ) {
4052
            float_raise( float_flag_invalid STATUS_VAR);
4053
        }
4054
        return 0;
4055
    }
4056
    aSign = extractFloatx80Sign( a );
4057
    bSign = extractFloatx80Sign( b );
4058
    if ( aSign != bSign ) {
4059
        return
4060
               aSign
4061
            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4062
                 != 0 );
4063
    }
4064
    return
4065
          aSign ? lt128( b.high, b.low, a.high, a.low )
4066
        : lt128( a.high, a.low, b.high, b.low );
4067

    
4068
}
4069

    
4070
#endif
4071

    
4072
#ifdef FLOAT128
4073

    
4074
/*----------------------------------------------------------------------------
4075
| Returns the result of converting the quadruple-precision floating-point
4076
| value `a' to the 32-bit two's complement integer format.  The conversion
4077
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4078
| Arithmetic---which means in particular that the conversion is rounded
4079
| according to the current rounding mode.  If `a' is a NaN, the largest
4080
| positive integer is returned.  Otherwise, if the conversion overflows, the
4081
| largest integer with the same sign as `a' is returned.
4082
*----------------------------------------------------------------------------*/
4083

    
4084
int32 float128_to_int32( float128 a STATUS_PARAM )
4085
{
4086
    flag aSign;
4087
    int32 aExp, shiftCount;
4088
    bits64 aSig0, aSig1;
4089

    
4090
    aSig1 = extractFloat128Frac1( a );
4091
    aSig0 = extractFloat128Frac0( a );
4092
    aExp = extractFloat128Exp( a );
4093
    aSign = extractFloat128Sign( a );
4094
    if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4095
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4096
    aSig0 |= ( aSig1 != 0 );
4097
    shiftCount = 0x4028 - aExp;
4098
    if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4099
    return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4100

    
4101
}
4102

    
4103
/*----------------------------------------------------------------------------
4104
| Returns the result of converting the quadruple-precision floating-point
4105
| value `a' to the 32-bit two's complement integer format.  The conversion
4106
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4107
| Arithmetic, except that the conversion is always rounded toward zero.  If
4108
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4109
| conversion overflows, the largest integer with the same sign as `a' is
4110
| returned.
4111
*----------------------------------------------------------------------------*/
4112

    
4113
int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4114
{
4115
    flag aSign;
4116
    int32 aExp, shiftCount;
4117
    bits64 aSig0, aSig1, savedASig;
4118
    int32 z;
4119

    
4120
    aSig1 = extractFloat128Frac1( a );
4121
    aSig0 = extractFloat128Frac0( a );
4122
    aExp = extractFloat128Exp( a );
4123
    aSign = extractFloat128Sign( a );
4124
    aSig0 |= ( aSig1 != 0 );
4125
    if ( 0x401E < aExp ) {
4126
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4127
        goto invalid;
4128
    }
4129
    else if ( aExp < 0x3FFF ) {
4130
        if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4131
        return 0;
4132
    }
4133
    aSig0 |= LIT64( 0x0001000000000000 );
4134
    shiftCount = 0x402F - aExp;
4135
    savedASig = aSig0;
4136
    aSig0 >>= shiftCount;
4137
    z = aSig0;
4138
    if ( aSign ) z = - z;
4139
    if ( ( z < 0 ) ^ aSign ) {
4140
 invalid:
4141
        float_raise( float_flag_invalid STATUS_VAR);
4142
        return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4143
    }
4144
    if ( ( aSig0<<shiftCount ) != savedASig ) {
4145
        STATUS(float_exception_flags) |= float_flag_inexact;
4146
    }
4147
    return z;
4148

    
4149
}
4150

    
4151
/*----------------------------------------------------------------------------
4152
| Returns the result of converting the quadruple-precision floating-point
4153
| value `a' to the 64-bit two's complement integer format.  The conversion
4154
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4155
| Arithmetic---which means in particular that the conversion is rounded
4156
| according to the current rounding mode.  If `a' is a NaN, the largest
4157
| positive integer is returned.  Otherwise, if the conversion overflows, the
4158
| largest integer with the same sign as `a' is returned.
4159
*----------------------------------------------------------------------------*/
4160

    
4161
int64 float128_to_int64( float128 a STATUS_PARAM )
4162
{
4163
    flag aSign;
4164
    int32 aExp, shiftCount;
4165
    bits64 aSig0, aSig1;
4166

    
4167
    aSig1 = extractFloat128Frac1( a );
4168
    aSig0 = extractFloat128Frac0( a );
4169
    aExp = extractFloat128Exp( a );
4170
    aSign = extractFloat128Sign( a );
4171
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4172
    shiftCount = 0x402F - aExp;
4173
    if ( shiftCount <= 0 ) {
4174
        if ( 0x403E < aExp ) {
4175
            float_raise( float_flag_invalid STATUS_VAR);
4176
            if (    ! aSign
4177
                 || (    ( aExp == 0x7FFF )
4178
                      && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4179
                    )
4180
               ) {
4181
                return LIT64( 0x7FFFFFFFFFFFFFFF );
4182
            }
4183
            return (sbits64) LIT64( 0x8000000000000000 );
4184
        }
4185
        shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4186
    }
4187
    else {
4188
        shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4189
    }
4190
    return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4191

    
4192
}
4193

    
4194
/*----------------------------------------------------------------------------
4195
| Returns the result of converting the quadruple-precision floating-point
4196
| value `a' to the 64-bit two's complement integer format.  The conversion
4197
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4198
| Arithmetic, except that the conversion is always rounded toward zero.
4199
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4200
| the conversion overflows, the largest integer with the same sign as `a' is
4201
| returned.
4202
*----------------------------------------------------------------------------*/
4203

    
4204
int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4205
{
4206
    flag aSign;
4207
    int32 aExp, shiftCount;
4208
    bits64 aSig0, aSig1;
4209
    int64 z;
4210

    
4211
    aSig1 = extractFloat128Frac1( a );
4212
    aSig0 = extractFloat128Frac0( a );
4213
    aExp = extractFloat128Exp( a );
4214
    aSign = extractFloat128Sign( a );
4215
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4216
    shiftCount = aExp - 0x402F;
4217
    if ( 0 < shiftCount ) {
4218
        if ( 0x403E <= aExp ) {
4219
            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4220
            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4221
                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4222
                if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4223
            }
4224
            else {
4225
                float_raise( float_flag_invalid STATUS_VAR);
4226
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4227
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
4228
                }
4229
            }
4230
            return (sbits64) LIT64( 0x8000000000000000 );
4231
        }
4232
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4233
        if ( (bits64) ( aSig1<<shiftCount ) ) {
4234
            STATUS(float_exception_flags) |= float_flag_inexact;
4235
        }
4236
    }
4237
    else {
4238
        if ( aExp < 0x3FFF ) {
4239
            if ( aExp | aSig0 | aSig1 ) {
4240
                STATUS(float_exception_flags) |= float_flag_inexact;
4241
            }
4242
            return 0;
4243
        }
4244
        z = aSig0>>( - shiftCount );
4245
        if (    aSig1
4246
             || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4247
            STATUS(float_exception_flags) |= float_flag_inexact;
4248
        }
4249
    }
4250
    if ( aSign ) z = - z;
4251
    return z;
4252

    
4253
}
4254

    
4255
/*----------------------------------------------------------------------------
4256
| Returns the result of converting the quadruple-precision floating-point
4257
| value `a' to the single-precision floating-point format.  The conversion
4258
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4259
| Arithmetic.
4260
*----------------------------------------------------------------------------*/
4261

    
4262
float32 float128_to_float32( float128 a STATUS_PARAM )
4263
{
4264
    flag aSign;
4265
    int32 aExp;
4266
    bits64 aSig0, aSig1;
4267
    bits32 zSig;
4268

    
4269
    aSig1 = extractFloat128Frac1( a );
4270
    aSig0 = extractFloat128Frac0( a );
4271
    aExp = extractFloat128Exp( a );
4272
    aSign = extractFloat128Sign( a );
4273
    if ( aExp == 0x7FFF ) {
4274
        if ( aSig0 | aSig1 ) {
4275
            return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4276
        }
4277
        return packFloat32( aSign, 0xFF, 0 );
4278
    }
4279
    aSig0 |= ( aSig1 != 0 );
4280
    shift64RightJamming( aSig0, 18, &aSig0 );
4281
    zSig = aSig0;
4282
    if ( aExp || zSig ) {
4283
        zSig |= 0x40000000;
4284
        aExp -= 0x3F81;
4285
    }
4286
    return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4287

    
4288
}
4289

    
4290
/*----------------------------------------------------------------------------
4291
| Returns the result of converting the quadruple-precision floating-point
4292
| value `a' to the double-precision floating-point format.  The conversion
4293
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4294
| Arithmetic.
4295
*----------------------------------------------------------------------------*/
4296

    
4297
float64 float128_to_float64( float128 a STATUS_PARAM )
4298
{
4299
    flag aSign;
4300
    int32 aExp;
4301
    bits64 aSig0, aSig1;
4302

    
4303
    aSig1 = extractFloat128Frac1( a );
4304
    aSig0 = extractFloat128Frac0( a );
4305
    aExp = extractFloat128Exp( a );
4306
    aSign = extractFloat128Sign( a );
4307
    if ( aExp == 0x7FFF ) {
4308
        if ( aSig0 | aSig1 ) {
4309
            return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4310
        }
4311
        return packFloat64( aSign, 0x7FF, 0 );
4312
    }
4313
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4314
    aSig0 |= ( aSig1 != 0 );
4315
    if ( aExp || aSig0 ) {
4316
        aSig0 |= LIT64( 0x4000000000000000 );
4317
        aExp -= 0x3C01;
4318
    }
4319
    return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4320

    
4321
}
4322

    
4323
#ifdef FLOATX80
4324

    
4325
/*----------------------------------------------------------------------------
4326
| Returns the result of converting the quadruple-precision floating-point
4327
| value `a' to the extended double-precision floating-point format.  The
4328
| conversion is performed according to the IEC/IEEE Standard for Binary
4329
| Floating-Point Arithmetic.
4330
*----------------------------------------------------------------------------*/
4331

    
4332
floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4333
{
4334
    flag aSign;
4335
    int32 aExp;
4336
    bits64 aSig0, aSig1;
4337

    
4338
    aSig1 = extractFloat128Frac1( a );
4339
    aSig0 = extractFloat128Frac0( a );
4340
    aExp = extractFloat128Exp( a );
4341
    aSign = extractFloat128Sign( a );
4342
    if ( aExp == 0x7FFF ) {
4343
        if ( aSig0 | aSig1 ) {
4344
            return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4345
        }
4346
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4347
    }
4348
    if ( aExp == 0 ) {
4349
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4350
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4351
    }
4352
    else {
4353
        aSig0 |= LIT64( 0x0001000000000000 );
4354
    }
4355
    shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4356
    return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4357

    
4358
}
4359

    
4360
#endif
4361

    
4362
/*----------------------------------------------------------------------------
4363
| Rounds the quadruple-precision floating-point value `a' to an integer, and
4364
| returns the result as a quadruple-precision floating-point value.  The
4365
| operation is performed according to the IEC/IEEE Standard for Binary
4366
| Floating-Point Arithmetic.
4367
*----------------------------------------------------------------------------*/
4368

    
4369
float128 float128_round_to_int( float128 a STATUS_PARAM )
4370
{
4371
    flag aSign;
4372
    int32 aExp;
4373
    bits64 lastBitMask, roundBitsMask;
4374
    int8 roundingMode;
4375
    float128 z;
4376

    
4377
    aExp = extractFloat128Exp( a );
4378
    if ( 0x402F <= aExp ) {
4379
        if ( 0x406F <= aExp ) {
4380
            if (    ( aExp == 0x7FFF )
4381
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4382
               ) {
4383
                return propagateFloat128NaN( a, a STATUS_VAR );
4384
            }
4385
            return a;
4386
        }
4387
        lastBitMask = 1;
4388
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4389
        roundBitsMask = lastBitMask - 1;
4390
        z = a;
4391
        roundingMode = STATUS(float_rounding_mode);
4392
        if ( roundingMode == float_round_nearest_even ) {
4393
            if ( lastBitMask ) {
4394
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4395
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4396
            }
4397
            else {
4398
                if ( (sbits64) z.low < 0 ) {
4399
                    ++z.high;
4400
                    if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4401
                }
4402
            }
4403
        }
4404
        else if ( roundingMode != float_round_to_zero ) {
4405
            if (   extractFloat128Sign( z )
4406
                 ^ ( roundingMode == float_round_up ) ) {
4407
                add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4408
            }
4409
        }
4410
        z.low &= ~ roundBitsMask;
4411
    }
4412
    else {
4413
        if ( aExp < 0x3FFF ) {
4414
            if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4415
            STATUS(float_exception_flags) |= float_flag_inexact;
4416
            aSign = extractFloat128Sign( a );
4417
            switch ( STATUS(float_rounding_mode) ) {
4418
             case float_round_nearest_even:
4419
                if (    ( aExp == 0x3FFE )
4420
                     && (   extractFloat128Frac0( a )
4421
                          | extractFloat128Frac1( a ) )
4422
                   ) {
4423
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
4424
                }
4425
                break;
4426
             case float_round_down:
4427
                return
4428
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4429
                    : packFloat128( 0, 0, 0, 0 );
4430
             case float_round_up:
4431
                return
4432
                      aSign ? packFloat128( 1, 0, 0, 0 )
4433
                    : packFloat128( 0, 0x3FFF, 0, 0 );
4434
            }
4435
            return packFloat128( aSign, 0, 0, 0 );
4436
        }
4437
        lastBitMask = 1;
4438
        lastBitMask <<= 0x402F - aExp;
4439
        roundBitsMask = lastBitMask - 1;
4440
        z.low = 0;
4441
        z.high = a.high;
4442
        roundingMode = STATUS(float_rounding_mode);
4443
        if ( roundingMode == float_round_nearest_even ) {
4444
            z.high += lastBitMask>>1;
4445
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4446
                z.high &= ~ lastBitMask;
4447
            }
4448
        }
4449
        else if ( roundingMode != float_round_to_zero ) {
4450
            if (   extractFloat128Sign( z )
4451
                 ^ ( roundingMode == float_round_up ) ) {
4452
                z.high |= ( a.low != 0 );
4453
                z.high += roundBitsMask;
4454
            }
4455
        }
4456
        z.high &= ~ roundBitsMask;
4457
    }
4458
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4459
        STATUS(float_exception_flags) |= float_flag_inexact;
4460
    }
4461
    return z;
4462

    
4463
}
4464

    
4465
/*----------------------------------------------------------------------------
4466
| Returns the result of adding the absolute values of the quadruple-precision
4467
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4468
| before being returned.  `zSign' is ignored if the result is a NaN.
4469
| The addition is performed according to the IEC/IEEE Standard for Binary
4470
| Floating-Point Arithmetic.
4471
*----------------------------------------------------------------------------*/
4472

    
4473
static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4474
{
4475
    int32 aExp, bExp, zExp;
4476
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4477
    int32 expDiff;
4478

    
4479
    aSig1 = extractFloat128Frac1( a );
4480
    aSig0 = extractFloat128Frac0( a );
4481
    aExp = extractFloat128Exp( a );
4482
    bSig1 = extractFloat128Frac1( b );
4483
    bSig0 = extractFloat128Frac0( b );
4484
    bExp = extractFloat128Exp( b );
4485
    expDiff = aExp - bExp;
4486
    if ( 0 < expDiff ) {
4487
        if ( aExp == 0x7FFF ) {
4488
            if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4489
            return a;
4490
        }
4491
        if ( bExp == 0 ) {
4492
            --expDiff;
4493
        }
4494
        else {
4495
            bSig0 |= LIT64( 0x0001000000000000 );
4496
        }
4497
        shift128ExtraRightJamming(
4498
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4499
        zExp = aExp;
4500
    }
4501
    else if ( expDiff < 0 ) {
4502
        if ( bExp == 0x7FFF ) {
4503
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4504
            return packFloat128( zSign, 0x7FFF, 0, 0 );
4505
        }
4506
        if ( aExp == 0 ) {
4507
            ++expDiff;
4508
        }
4509
        else {
4510
            aSig0 |= LIT64( 0x0001000000000000 );
4511
        }
4512
        shift128ExtraRightJamming(
4513
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4514
        zExp = bExp;
4515
    }
4516
    else {
4517
        if ( aExp == 0x7FFF ) {
4518
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4519
                return propagateFloat128NaN( a, b STATUS_VAR );
4520
            }
4521
            return a;
4522
        }
4523
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4524
        if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4525
        zSig2 = 0;
4526
        zSig0 |= LIT64( 0x0002000000000000 );
4527
        zExp = aExp;
4528
        goto shiftRight1;
4529
    }
4530
    aSig0 |= LIT64( 0x0001000000000000 );
4531
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4532
    --zExp;
4533
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4534
    ++zExp;
4535
 shiftRight1:
4536
    shift128ExtraRightJamming(
4537
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4538
 roundAndPack:
4539
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4540

    
4541
}
4542

    
4543
/*----------------------------------------------------------------------------
4544
| Returns the result of subtracting the absolute values of the quadruple-
4545
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
4546
| difference is negated before being returned.  `zSign' is ignored if the
4547
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
4548
| Standard for Binary Floating-Point Arithmetic.
4549
*----------------------------------------------------------------------------*/
4550

    
4551
static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4552
{
4553
    int32 aExp, bExp, zExp;
4554
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4555
    int32 expDiff;
4556
    float128 z;
4557

    
4558
    aSig1 = extractFloat128Frac1( a );
4559
    aSig0 = extractFloat128Frac0( a );
4560
    aExp = extractFloat128Exp( a );
4561
    bSig1 = extractFloat128Frac1( b );
4562
    bSig0 = extractFloat128Frac0( b );
4563
    bExp = extractFloat128Exp( b );
4564
    expDiff = aExp - bExp;
4565
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4566
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4567
    if ( 0 < expDiff ) goto aExpBigger;
4568
    if ( expDiff < 0 ) goto bExpBigger;
4569
    if ( aExp == 0x7FFF ) {
4570
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4571
            return propagateFloat128NaN( a, b STATUS_VAR );
4572
        }
4573
        float_raise( float_flag_invalid STATUS_VAR);
4574
        z.low = float128_default_nan_low;
4575
        z.high = float128_default_nan_high;
4576
        return z;
4577
    }
4578
    if ( aExp == 0 ) {
4579
        aExp = 1;
4580
        bExp = 1;
4581
    }
4582
    if ( bSig0 < aSig0 ) goto aBigger;
4583
    if ( aSig0 < bSig0 ) goto bBigger;
4584
    if ( bSig1 < aSig1 ) goto aBigger;
4585
    if ( aSig1 < bSig1 ) goto bBigger;
4586
    return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
4587
 bExpBigger:
4588
    if ( bExp == 0x7FFF ) {
4589
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4590
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4591
    }
4592
    if ( aExp == 0 ) {
4593
        ++expDiff;
4594
    }
4595
    else {
4596
        aSig0 |= LIT64( 0x4000000000000000 );
4597
    }
4598
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4599
    bSig0 |= LIT64( 0x4000000000000000 );
4600
 bBigger:
4601
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4602
    zExp = bExp;
4603
    zSign ^= 1;
4604
    goto normalizeRoundAndPack;
4605
 aExpBigger:
4606
    if ( aExp == 0x7FFF ) {
4607
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4608
        return a;
4609
    }
4610
    if ( bExp == 0 ) {
4611
        --expDiff;
4612
    }
4613
    else {
4614
        bSig0 |= LIT64( 0x4000000000000000 );
4615
    }
4616
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4617
    aSig0 |= LIT64( 0x4000000000000000 );
4618
 aBigger:
4619
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4620
    zExp = aExp;
4621
 normalizeRoundAndPack:
4622
    --zExp;
4623
    return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
4624

    
4625
}
4626

    
4627
/*----------------------------------------------------------------------------
4628
| Returns the result of adding the quadruple-precision floating-point values
4629
| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4630
| for Binary Floating-Point Arithmetic.
4631
*----------------------------------------------------------------------------*/
4632

    
4633
float128 float128_add( float128 a, float128 b STATUS_PARAM )
4634
{
4635
    flag aSign, bSign;
4636

    
4637
    aSign = extractFloat128Sign( a );
4638
    bSign = extractFloat128Sign( b );
4639
    if ( aSign == bSign ) {
4640
        return addFloat128Sigs( a, b, aSign STATUS_VAR );
4641
    }
4642
    else {
4643
        return subFloat128Sigs( a, b, aSign STATUS_VAR );
4644
    }
4645

    
4646
}
4647

    
4648
/*----------------------------------------------------------------------------
4649
| Returns the result of subtracting the quadruple-precision floating-point
4650
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4651
| Standard for Binary Floating-Point Arithmetic.
4652
*----------------------------------------------------------------------------*/
4653

    
4654
float128 float128_sub( float128 a, float128 b STATUS_PARAM )
4655
{
4656
    flag aSign, bSign;
4657

    
4658
    aSign = extractFloat128Sign( a );
4659
    bSign = extractFloat128Sign( b );
4660
    if ( aSign == bSign ) {
4661
        return subFloat128Sigs( a, b, aSign STATUS_VAR );
4662
    }
4663
    else {
4664
        return addFloat128Sigs( a, b, aSign STATUS_VAR );
4665
    }
4666

    
4667
}
4668

    
4669
/*----------------------------------------------------------------------------
4670
| Returns the result of multiplying the quadruple-precision floating-point
4671
| values `a' and `b'.  The operation is performed according to the IEC/IEEE
4672
| Standard for Binary Floating-Point Arithmetic.
4673
*----------------------------------------------------------------------------*/
4674

    
4675
float128 float128_mul( float128 a, float128 b STATUS_PARAM )
4676
{
4677
    flag aSign, bSign, zSign;
4678
    int32 aExp, bExp, zExp;
4679
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4680
    float128 z;
4681

    
4682
    aSig1 = extractFloat128Frac1( a );
4683
    aSig0 = extractFloat128Frac0( a );
4684
    aExp = extractFloat128Exp( a );
4685
    aSign = extractFloat128Sign( a );
4686
    bSig1 = extractFloat128Frac1( b );
4687
    bSig0 = extractFloat128Frac0( b );
4688
    bExp = extractFloat128Exp( b );
4689
    bSign = extractFloat128Sign( b );
4690
    zSign = aSign ^ bSign;
4691
    if ( aExp == 0x7FFF ) {
4692
        if (    ( aSig0 | aSig1 )
4693
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4694
            return propagateFloat128NaN( a, b STATUS_VAR );
4695
        }
4696
        if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4697
        return packFloat128( zSign, 0x7FFF, 0, 0 );
4698
    }
4699
    if ( bExp == 0x7FFF ) {
4700
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4701
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4702
 invalid:
4703
            float_raise( float_flag_invalid STATUS_VAR);
4704
            z.low = float128_default_nan_low;
4705
            z.high = float128_default_nan_high;
4706
            return z;
4707
        }
4708
        return packFloat128( zSign, 0x7FFF, 0, 0 );
4709
    }
4710
    if ( aExp == 0 ) {
4711
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4712
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4713
    }
4714
    if ( bExp == 0 ) {
4715
        if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4716
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4717
    }
4718
    zExp = aExp + bExp - 0x4000;
4719
    aSig0 |= LIT64( 0x0001000000000000 );
4720
    shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4721
    mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4722
    add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4723
    zSig2 |= ( zSig3 != 0 );
4724
    if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4725
        shift128ExtraRightJamming(
4726
            zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4727
        ++zExp;
4728
    }
4729
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4730

    
4731
}
4732

    
4733
/*----------------------------------------------------------------------------
4734
| Returns the result of dividing the quadruple-precision floating-point value
4735
| `a' by the corresponding value `b'.  The operation is performed according to
4736
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4737
*----------------------------------------------------------------------------*/
4738

    
4739
float128 float128_div( float128 a, float128 b STATUS_PARAM )
4740
{
4741
    flag aSign, bSign, zSign;
4742
    int32 aExp, bExp, zExp;
4743
    bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4744
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4745
    float128 z;
4746

    
4747
    aSig1 = extractFloat128Frac1( a );
4748
    aSig0 = extractFloat128Frac0( a );
4749
    aExp = extractFloat128Exp( a );
4750
    aSign = extractFloat128Sign( a );
4751
    bSig1 = extractFloat128Frac1( b );
4752
    bSig0 = extractFloat128Frac0( b );
4753
    bExp = extractFloat128Exp( b );
4754
    bSign = extractFloat128Sign( b );
4755
    zSign = aSign ^ bSign;
4756
    if ( aExp == 0x7FFF ) {
4757
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4758
        if ( bExp == 0x7FFF ) {
4759
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4760
            goto invalid;
4761
        }
4762
        return packFloat128( zSign, 0x7FFF, 0, 0 );
4763
    }
4764
    if ( bExp == 0x7FFF ) {
4765
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4766
        return packFloat128( zSign, 0, 0, 0 );
4767
    }
4768
    if ( bExp == 0 ) {
4769
        if ( ( bSig0 | bSig1 ) == 0 ) {
4770
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4771
 invalid:
4772
                float_raise( float_flag_invalid STATUS_VAR);
4773
                z.low = float128_default_nan_low;
4774
                z.high = float128_default_nan_high;
4775
                return z;
4776
            }
4777
            float_raise( float_flag_divbyzero STATUS_VAR);
4778
            return packFloat128( zSign, 0x7FFF, 0, 0 );
4779
        }
4780
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4781
    }
4782
    if ( aExp == 0 ) {
4783
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4784
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4785
    }
4786
    zExp = aExp - bExp + 0x3FFD;
4787
    shortShift128Left(
4788
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4789
    shortShift128Left(
4790
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4791
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4792
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4793
        ++zExp;
4794
    }
4795
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4796
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4797
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4798
    while ( (sbits64) rem0 < 0 ) {
4799
        --zSig0;
4800
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4801
    }
4802
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4803
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4804
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4805
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4806
        while ( (sbits64) rem1 < 0 ) {
4807
            --zSig1;
4808
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4809
        }
4810
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4811
    }
4812
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4813
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4814

    
4815
}
4816

    
4817
/*----------------------------------------------------------------------------
4818
| Returns the remainder of the quadruple-precision floating-point value `a'
4819
| with respect to the corresponding value `b'.  The operation is performed
4820
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4821
*----------------------------------------------------------------------------*/
4822

    
4823
float128 float128_rem( float128 a, float128 b STATUS_PARAM )
4824
{
4825
    flag aSign, bSign, zSign;
4826
    int32 aExp, bExp, expDiff;
4827
    bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
4828
    bits64 allZero, alternateASig0, alternateASig1, sigMean1;
4829
    sbits64 sigMean0;
4830
    float128 z;
4831

    
4832
    aSig1 = extractFloat128Frac1( a );
4833
    aSig0 = extractFloat128Frac0( a );
4834
    aExp = extractFloat128Exp( a );
4835
    aSign = extractFloat128Sign( a );
4836
    bSig1 = extractFloat128Frac1( b );
4837
    bSig0 = extractFloat128Frac0( b );
4838
    bExp = extractFloat128Exp( b );
4839
    bSign = extractFloat128Sign( b );
4840
    if ( aExp == 0x7FFF ) {
4841
        if (    ( aSig0 | aSig1 )
4842
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4843
            return propagateFloat128NaN( a, b STATUS_VAR );
4844
        }
4845
        goto invalid;
4846
    }
4847
    if ( bExp == 0x7FFF ) {
4848
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4849
        return a;
4850
    }
4851
    if ( bExp == 0 ) {
4852
        if ( ( bSig0 | bSig1 ) == 0 ) {
4853
 invalid:
4854
            float_raise( float_flag_invalid STATUS_VAR);
4855
            z.low = float128_default_nan_low;
4856
            z.high = float128_default_nan_high;
4857
            return z;
4858
        }
4859
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4860
    }
4861
    if ( aExp == 0 ) {
4862
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
4863
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4864
    }
4865
    expDiff = aExp - bExp;
4866
    if ( expDiff < -1 ) return a;
4867
    shortShift128Left(
4868
        aSig0 | LIT64( 0x0001000000000000 ),
4869
        aSig1,
4870
        15 - ( expDiff < 0 ),
4871
        &aSig0,
4872
        &aSig1
4873
    );
4874
    shortShift128Left(
4875
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4876
    q = le128( bSig0, bSig1, aSig0, aSig1 );
4877
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4878
    expDiff -= 64;
4879
    while ( 0 < expDiff ) {
4880
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4881
        q = ( 4 < q ) ? q - 4 : 0;
4882
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4883
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
4884
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
4885
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
4886
        expDiff -= 61;
4887
    }
4888
    if ( -64 < expDiff ) {
4889
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4890
        q = ( 4 < q ) ? q - 4 : 0;
4891
        q >>= - expDiff;
4892
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4893
        expDiff += 52;
4894
        if ( expDiff < 0 ) {
4895
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4896
        }
4897
        else {
4898
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
4899
        }
4900
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4901
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
4902
    }
4903
    else {
4904
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
4905
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4906
    }
4907
    do {
4908
        alternateASig0 = aSig0;
4909
        alternateASig1 = aSig1;
4910
        ++q;
4911
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4912
    } while ( 0 <= (sbits64) aSig0 );
4913
    add128(
4914
        aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
4915
    if (    ( sigMean0 < 0 )
4916
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
4917
        aSig0 = alternateASig0;
4918
        aSig1 = alternateASig1;
4919
    }
4920
    zSign = ( (sbits64) aSig0 < 0 );
4921
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
4922
    return
4923
        normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
4924

    
4925
}
4926

    
4927
/*----------------------------------------------------------------------------
4928
| Returns the square root of the quadruple-precision floating-point value `a'.
4929
| The operation is performed according to the IEC/IEEE Standard for Binary
4930
| Floating-Point Arithmetic.
4931
*----------------------------------------------------------------------------*/
4932

    
4933
float128 float128_sqrt( float128 a STATUS_PARAM )
4934
{
4935
    flag aSign;
4936
    int32 aExp, zExp;
4937
    bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
4938
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4939
    float128 z;
4940

    
4941
    aSig1 = extractFloat128Frac1( a );
4942
    aSig0 = extractFloat128Frac0( a );
4943
    aExp = extractFloat128Exp( a );
4944
    aSign = extractFloat128Sign( a );
4945
    if ( aExp == 0x7FFF ) {
4946
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
4947
        if ( ! aSign ) return a;
4948
        goto invalid;
4949
    }
4950
    if ( aSign ) {
4951
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
4952
 invalid:
4953
        float_raise( float_flag_invalid STATUS_VAR);
4954
        z.low = float128_default_nan_low;
4955
        z.high = float128_default_nan_high;
4956
        return z;
4957
    }
4958
    if ( aExp == 0 ) {
4959
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
4960
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4961
    }
4962
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
4963
    aSig0 |= LIT64( 0x0001000000000000 );
4964
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
4965
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
4966
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4967
    doubleZSig0 = zSig0<<1;
4968
    mul64To128( zSig0, zSig0, &term0, &term1 );
4969
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4970
    while ( (sbits64) rem0 < 0 ) {
4971
        --zSig0;
4972
        doubleZSig0 -= 2;
4973
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4974
    }
4975
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4976
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
4977
        if ( zSig1 == 0 ) zSig1 = 1;
4978
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4979
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4980
        mul64To128( zSig1, zSig1, &term2, &term3 );
4981
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4982
        while ( (sbits64) rem1 < 0 ) {
4983
            --zSig1;
4984
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4985
            term3 |= 1;
4986
            term2 |= doubleZSig0;
4987
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4988
        }
4989
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4990
    }
4991
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
4992
    return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4993

    
4994
}
4995

    
4996
/*----------------------------------------------------------------------------
4997
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
4998
| the corresponding value `b', and 0 otherwise.  The comparison is performed
4999
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5000
*----------------------------------------------------------------------------*/
5001

    
5002
flag float128_eq( float128 a, float128 b STATUS_PARAM )
5003
{
5004

    
5005
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5006
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5007
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5008
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5009
       ) {
5010
        if (    float128_is_signaling_nan( a )
5011
             || float128_is_signaling_nan( b ) ) {
5012
            float_raise( float_flag_invalid STATUS_VAR);
5013
        }
5014
        return 0;
5015
    }
5016
    return
5017
           ( a.low == b.low )
5018
        && (    ( a.high == b.high )
5019
             || (    ( a.low == 0 )
5020
                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5021
           );
5022

    
5023
}
5024

    
5025
/*----------------------------------------------------------------------------
5026
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5027
| or equal to the corresponding value `b', and 0 otherwise.  The comparison
5028
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5029
| Arithmetic.
5030
*----------------------------------------------------------------------------*/
5031

    
5032
flag float128_le( float128 a, float128 b STATUS_PARAM )
5033
{
5034
    flag aSign, bSign;
5035

    
5036
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5037
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5038
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5039
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5040
       ) {
5041
        float_raise( float_flag_invalid STATUS_VAR);
5042
        return 0;
5043
    }
5044
    aSign = extractFloat128Sign( a );
5045
    bSign = extractFloat128Sign( b );
5046
    if ( aSign != bSign ) {
5047
        return
5048
               aSign
5049
            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5050
                 == 0 );
5051
    }
5052
    return
5053
          aSign ? le128( b.high, b.low, a.high, a.low )
5054
        : le128( a.high, a.low, b.high, b.low );
5055

    
5056
}
5057

    
5058
/*----------------------------------------------------------------------------
5059
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5060
| the corresponding value `b', and 0 otherwise.  The comparison is performed
5061
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5062
*----------------------------------------------------------------------------*/
5063

    
5064
flag float128_lt( float128 a, float128 b STATUS_PARAM )
5065
{
5066
    flag aSign, bSign;
5067

    
5068
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5069
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5070
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5071
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5072
       ) {
5073
        float_raise( float_flag_invalid STATUS_VAR);
5074
        return 0;
5075
    }
5076
    aSign = extractFloat128Sign( a );
5077
    bSign = extractFloat128Sign( b );
5078
    if ( aSign != bSign ) {
5079
        return
5080
               aSign
5081
            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5082
                 != 0 );
5083
    }
5084
    return
5085
          aSign ? lt128( b.high, b.low, a.high, a.low )
5086
        : lt128( a.high, a.low, b.high, b.low );
5087

    
5088
}
5089

    
5090
/*----------------------------------------------------------------------------
5091
| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5092
| the corresponding value `b', and 0 otherwise.  The invalid exception is
5093
| raised if either operand is a NaN.  Otherwise, the comparison is performed
5094
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5095
*----------------------------------------------------------------------------*/
5096

    
5097
flag float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5098
{
5099

    
5100
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5101
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5102
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5103
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5104
       ) {
5105
        float_raise( float_flag_invalid STATUS_VAR);
5106
        return 0;
5107
    }
5108
    return
5109
           ( a.low == b.low )
5110
        && (    ( a.high == b.high )
5111
             || (    ( a.low == 0 )
5112
                  && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5113
           );
5114

    
5115
}
5116

    
5117
/*----------------------------------------------------------------------------
5118
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5119
| or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5120
| cause an exception.  Otherwise, the comparison is performed according to the
5121
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5122
*----------------------------------------------------------------------------*/
5123

    
5124
flag float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5125
{
5126
    flag aSign, bSign;
5127

    
5128
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5129
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5130
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5131
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5132
       ) {
5133
        if (    float128_is_signaling_nan( a )
5134
             || float128_is_signaling_nan( b ) ) {
5135
            float_raise( float_flag_invalid STATUS_VAR);
5136
        }
5137
        return 0;
5138
    }
5139
    aSign = extractFloat128Sign( a );
5140
    bSign = extractFloat128Sign( b );
5141
    if ( aSign != bSign ) {
5142
        return
5143
               aSign
5144
            || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5145
                 == 0 );
5146
    }
5147
    return
5148
          aSign ? le128( b.high, b.low, a.high, a.low )
5149
        : le128( a.high, a.low, b.high, b.low );
5150

    
5151
}
5152

    
5153
/*----------------------------------------------------------------------------
5154
| Returns 1 if the quadruple-precision floating-point value `a' is less than
5155
| the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5156
| exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5157
| Standard for Binary Floating-Point Arithmetic.
5158
*----------------------------------------------------------------------------*/
5159

    
5160
flag float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5161
{
5162
    flag aSign, bSign;
5163

    
5164
    if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5165
              && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5166
         || (    ( extractFloat128Exp( b ) == 0x7FFF )
5167
              && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5168
       ) {
5169
        if (    float128_is_signaling_nan( a )
5170
             || float128_is_signaling_nan( b ) ) {
5171
            float_raise( float_flag_invalid STATUS_VAR);
5172
        }
5173
        return 0;
5174
    }
5175
    aSign = extractFloat128Sign( a );
5176
    bSign = extractFloat128Sign( b );
5177
    if ( aSign != bSign ) {
5178
        return
5179
               aSign
5180
            && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5181
                 != 0 );
5182
    }
5183
    return
5184
          aSign ? lt128( b.high, b.low, a.high, a.low )
5185
        : lt128( a.high, a.low, b.high, b.low );
5186

    
5187
}
5188

    
5189
#endif
5190

    
5191
/* misc functions */
5192
float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5193
{
5194
    return int64_to_float32(a STATUS_VAR);
5195
}
5196

    
5197
float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5198
{
5199
    return int64_to_float64(a STATUS_VAR);
5200
}
5201

    
5202
unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5203
{
5204
    int64_t v;
5205
    unsigned int res;
5206

    
5207
    v = float32_to_int64(a STATUS_VAR);
5208
    if (v < 0) {
5209
        res = 0;
5210
        float_raise( float_flag_invalid STATUS_VAR);
5211
    } else if (v > 0xffffffff) {
5212
        res = 0xffffffff;
5213
        float_raise( float_flag_invalid STATUS_VAR);
5214
    } else {
5215
        res = v;
5216
    }
5217
    return res;
5218
}
5219

    
5220
unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5221
{
5222
    int64_t v;
5223
    unsigned int res;
5224

    
5225
    v = float32_to_int64_round_to_zero(a STATUS_VAR);
5226
    if (v < 0) {
5227
        res = 0;
5228
        float_raise( float_flag_invalid STATUS_VAR);
5229
    } else if (v > 0xffffffff) {
5230
        res = 0xffffffff;
5231
        float_raise( float_flag_invalid STATUS_VAR);
5232
    } else {
5233
        res = v;
5234
    }
5235
    return res;
5236
}
5237

    
5238
unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5239
{
5240
    int64_t v;
5241
    unsigned int res;
5242

    
5243
    v = float64_to_int64(a STATUS_VAR);
5244
    if (v < 0) {
5245
        res = 0;
5246
        float_raise( float_flag_invalid STATUS_VAR);
5247
    } else if (v > 0xffffffff) {
5248
        res = 0xffffffff;
5249
        float_raise( float_flag_invalid STATUS_VAR);
5250
    } else {
5251
        res = v;
5252
    }
5253
    return res;
5254
}
5255

    
5256
unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5257
{
5258
    int64_t v;
5259
    unsigned int res;
5260

    
5261
    v = float64_to_int64_round_to_zero(a STATUS_VAR);
5262
    if (v < 0) {
5263
        res = 0;
5264
        float_raise( float_flag_invalid STATUS_VAR);
5265
    } else if (v > 0xffffffff) {
5266
        res = 0xffffffff;
5267
        float_raise( float_flag_invalid STATUS_VAR);
5268
    } else {
5269
        res = v;
5270
    }
5271
    return res;
5272
}
5273

    
5274
#define COMPARE(s, nan_exp)                                                  \
5275
INLINE char float ## s ## _compare_internal( float ## s a, float ## s b,     \
5276
                                      int is_quiet STATUS_PARAM )            \
5277
{                                                                            \
5278
    flag aSign, bSign;                                                       \
5279
                                                                             \
5280
    if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
5281
         extractFloat ## s ## Frac( a ) ) ||                                 \
5282
        ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
5283
          extractFloat ## s ## Frac( b ) )) {                                \
5284
        if (!is_quiet ||                                                     \
5285
            float ## s ## _is_signaling_nan( a ) ||                          \
5286
            float ## s ## _is_signaling_nan( b ) ) {                         \
5287
            float_raise( float_flag_invalid STATUS_VAR);                     \
5288
        }                                                                    \
5289
        return float_relation_unordered;                                     \
5290
    }                                                                        \
5291
    aSign = extractFloat ## s ## Sign( a );                                  \
5292
    bSign = extractFloat ## s ## Sign( b );                                  \
5293
    if ( aSign != bSign ) {                                                  \
5294
        if ( (bits ## s) ( ( a | b )<<1 ) == 0 ) {                           \
5295
            /* zero case */                                                  \
5296
            return float_relation_equal;                                     \
5297
        } else {                                                             \
5298
            return 1 - (2 * aSign);                                          \
5299
        }                                                                    \
5300
    } else {                                                                 \
5301
        if (a == b) {                                                        \
5302
            return float_relation_equal;                                     \
5303
        } else {                                                             \
5304
            return 1 - 2 * (aSign ^ ( a < b ));                              \
5305
        }                                                                    \
5306
    }                                                                        \
5307
}                                                                            \
5308
                                                                             \
5309
char float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )       \
5310
{                                                                            \
5311
    return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
5312
}                                                                            \
5313
                                                                             \
5314
char float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
5315
{                                                                            \
5316
    return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
5317
}
5318

    
5319
COMPARE(32, 0xff)
5320
COMPARE(64, 0x7ff)