Statistics
| Branch: | Revision:

root / target-arm / nwfpe / softfloat.c @ 157777ef

History | View | Annotate | Download (114.7 kB)

1
/*
2
===============================================================================
3

4
This C source file is part of the SoftFloat IEC/IEEE Floating-point
5
Arithmetic Package, Release 2.
6

7
Written by John R. Hauser.  This work was made possible in part by the
8
International Computer Science Institute, located at Suite 600, 1947 Center
9
Street, Berkeley, California 94704.  Funding was partially provided by the
10
National Science Foundation under grant MIP-9311980.  The original version
11
of this code was written as part of a project to build a fixed-point vector
12
processor in collaboration with the University of California at Berkeley,
13
overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
14
is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
15
arithmetic/softfloat.html'.
16

17
THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
18
has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
19
TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
20
PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
21
AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
22

23
Derivative works are acceptable, even for commercial purposes, so long as
24
(1) they include prominent notice that the work is derivative, and (2) they
25
include prominent notice akin to these three paragraphs for those parts of
26
this code that are retained.
27

28
===============================================================================
29
*/
30

    
31
#include "fpa11.h"
32
#include "milieu.h"
33
#include "softfloat.h"
34

    
35
/*
36
-------------------------------------------------------------------------------
37
Floating-point rounding mode, extended double-precision rounding precision,
38
and exception flags.
39
-------------------------------------------------------------------------------
40
*/
41
int8 float_rounding_mode = float_round_nearest_even;
42
int8 floatx80_rounding_precision = 80;
43
int8 float_exception_flags;
44

    
45
/*
46
-------------------------------------------------------------------------------
47
Primitive arithmetic functions, including multi-word arithmetic, and
48
division and square root approximations.  (Can be specialized to target if
49
desired.)
50
-------------------------------------------------------------------------------
51
*/
52
#include "softfloat-macros"
53

    
54
/*
55
-------------------------------------------------------------------------------
56
Functions and definitions to determine:  (1) whether tininess for underflow
57
is detected before or after rounding by default, (2) what (if anything)
58
happens when exceptions are raised, (3) how signaling NaNs are distinguished
59
from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
60
are propagated from function inputs to output.  These details are target-
61
specific.
62
-------------------------------------------------------------------------------
63
*/
64
#include "softfloat-specialize"
65

    
66
/*
67
-------------------------------------------------------------------------------
68
Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
69
and 7, and returns the properly rounded 32-bit integer corresponding to the
70
input.  If `zSign' is nonzero, the input is negated before being converted
71
to an integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point
72
input is simply rounded to an integer, with the inexact exception raised if
73
the input cannot be represented exactly as an integer.  If the fixed-point
74
input is too large, however, the invalid exception is raised and the largest
75
positive or negative integer is returned.
76
-------------------------------------------------------------------------------
77
*/
78
static int32 roundAndPackInt32( flag zSign, bits64 absZ )
79
{
80
    int8 roundingMode;
81
    flag roundNearestEven;
82
    int8 roundIncrement, roundBits;
83
    int32 z;
84

    
85
    roundingMode = float_rounding_mode;
86
    roundNearestEven = ( roundingMode == float_round_nearest_even );
87
    roundIncrement = 0x40;
88
    if ( ! roundNearestEven ) {
89
        if ( roundingMode == float_round_to_zero ) {
90
            roundIncrement = 0;
91
        }
92
        else {
93
            roundIncrement = 0x7F;
94
            if ( zSign ) {
95
                if ( roundingMode == float_round_up ) roundIncrement = 0;
96
            }
97
            else {
98
                if ( roundingMode == float_round_down ) roundIncrement = 0;
99
            }
100
        }
101
    }
102
    roundBits = absZ & 0x7F;
103
    absZ = ( absZ + roundIncrement )>>7;
104
    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
105
    z = absZ;
106
    if ( zSign ) z = - z;
107
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
108
        float_exception_flags |= float_flag_invalid;
109
        return zSign ? 0x80000000 : 0x7FFFFFFF;
110
    }
111
    if ( roundBits ) float_exception_flags |= float_flag_inexact;
112
    return z;
113

    
114
}
115

    
116
/*
117
-------------------------------------------------------------------------------
118
Returns the fraction bits of the single-precision floating-point value `a'.
119
-------------------------------------------------------------------------------
120
*/
121
INLINE bits32 extractFloat32Frac( float32 a )
122
{
123

    
124
    return a & 0x007FFFFF;
125

    
126
}
127

    
128
/*
129
-------------------------------------------------------------------------------
130
Returns the exponent bits of the single-precision floating-point value `a'.
131
-------------------------------------------------------------------------------
132
*/
133
INLINE int16 extractFloat32Exp( float32 a )
134
{
135

    
136
    return ( a>>23 ) & 0xFF;
137

    
138
}
139

    
140
/*
141
-------------------------------------------------------------------------------
142
Returns the sign bit of the single-precision floating-point value `a'.
143
-------------------------------------------------------------------------------
144
*/
145
INLINE flag extractFloat32Sign( float32 a )
146
{
147

    
148
    return a>>31;
149

    
150
}
151

    
152
/*
153
-------------------------------------------------------------------------------
154
Normalizes the subnormal single-precision floating-point value represented
155
by the denormalized significand `aSig'.  The normalized exponent and
156
significand are stored at the locations pointed to by `zExpPtr' and
157
`zSigPtr', respectively.
158
-------------------------------------------------------------------------------
159
*/
160
static void
161
 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
162
{
163
    int8 shiftCount;
164

    
165
    shiftCount = countLeadingZeros32( aSig ) - 8;
166
    *zSigPtr = aSig<<shiftCount;
167
    *zExpPtr = 1 - shiftCount;
168

    
169
}
170

    
171
/*
172
-------------------------------------------------------------------------------
173
Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
174
single-precision floating-point value, returning the result.  After being
175
shifted into the proper positions, the three fields are simply added
176
together to form the result.  This means that any integer portion of `zSig'
177
will be added into the exponent.  Since a properly normalized significand
178
will have an integer portion equal to 1, the `zExp' input should be 1 less
179
than the desired result exponent whenever `zSig' is a complete, normalized
180
significand.
181
-------------------------------------------------------------------------------
182
*/
183
INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
184
{
185
    return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
186
}
187

    
188
/*
189
-------------------------------------------------------------------------------
190
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
191
and significand `zSig', and returns the proper single-precision floating-
192
point value corresponding to the abstract input.  Ordinarily, the abstract
193
value is simply rounded and packed into the single-precision format, with
194
the inexact exception raised if the abstract input cannot be represented
195
exactly.  If the abstract value is too large, however, the overflow and
196
inexact exceptions are raised and an infinity or maximal finite value is
197
returned.  If the abstract value is too small, the input value is rounded to
198
a subnormal number, and the underflow and inexact exceptions are raised if
199
the abstract input cannot be represented exactly as a subnormal single-
200
precision floating-point number.
201
    The input significand `zSig' has its binary point between bits 30
202
and 29, which is 7 bits to the left of the usual location.  This shifted
203
significand must be normalized or smaller.  If `zSig' is not normalized,
204
`zExp' must be 0; in that case, the result returned is a subnormal number,
205
and it must not require rounding.  In the usual case that `zSig' is
206
normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
207
The handling of underflow and overflow follows the IEC/IEEE Standard for
208
Binary Floating-point Arithmetic.
209
-------------------------------------------------------------------------------
210
*/
211
static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
212
{
213
    int8 roundingMode;
214
    flag roundNearestEven;
215
    int8 roundIncrement, roundBits;
216
    flag isTiny;
217

    
218
    roundingMode = float_rounding_mode;
219
    roundNearestEven = ( roundingMode == float_round_nearest_even );
220
    roundIncrement = 0x40;
221
    if ( ! roundNearestEven ) {
222
        if ( roundingMode == float_round_to_zero ) {
223
            roundIncrement = 0;
224
        }
225
        else {
226
            roundIncrement = 0x7F;
227
            if ( zSign ) {
228
                if ( roundingMode == float_round_up ) roundIncrement = 0;
229
            }
230
            else {
231
                if ( roundingMode == float_round_down ) roundIncrement = 0;
232
            }
233
        }
234
    }
235
    roundBits = zSig & 0x7F;
236
    if ( 0xFD <= (bits16) zExp ) {
237
        if (    ( 0xFD < zExp )
238
             || (    ( zExp == 0xFD )
239
                  && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
240
           ) {
241
            float_raise( float_flag_overflow | float_flag_inexact );
242
            return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
243
        }
244
        if ( zExp < 0 ) {
245
            isTiny =
246
                   ( float_detect_tininess == float_tininess_before_rounding )
247
                || ( zExp < -1 )
248
                || ( zSig + roundIncrement < 0x80000000 );
249
            shift32RightJamming( zSig, - zExp, &zSig );
250
            zExp = 0;
251
            roundBits = zSig & 0x7F;
252
            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
253
        }
254
    }
255
    if ( roundBits ) float_exception_flags |= float_flag_inexact;
256
    zSig = ( zSig + roundIncrement )>>7;
257
    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
258
    if ( zSig == 0 ) zExp = 0;
259
    return packFloat32( zSign, zExp, zSig );
260

    
261
}
262

    
263
/*
264
-------------------------------------------------------------------------------
265
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
266
and significand `zSig', and returns the proper single-precision floating-
267
point value corresponding to the abstract input.  This routine is just like
268
`roundAndPackFloat32' except that `zSig' does not have to be normalized in
269
any way.  In all cases, `zExp' must be 1 less than the ``true'' floating-
270
point exponent.
271
-------------------------------------------------------------------------------
272
*/
273
static float32
274
 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
275
{
276
    int8 shiftCount;
277

    
278
    shiftCount = countLeadingZeros32( zSig ) - 1;
279
    return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
280

    
281
}
282

    
283
/*
284
-------------------------------------------------------------------------------
285
Returns the fraction bits of the double-precision floating-point value `a'.
286
-------------------------------------------------------------------------------
287
*/
288
INLINE bits64 extractFloat64Frac( float64 a )
289
{
290

    
291
    return a & LIT64( 0x000FFFFFFFFFFFFF );
292

    
293
}
294

    
295
/*
296
-------------------------------------------------------------------------------
297
Returns the exponent bits of the double-precision floating-point value `a'.
298
-------------------------------------------------------------------------------
299
*/
300
INLINE int16 extractFloat64Exp( float64 a )
301
{
302

    
303
    return ( a>>52 ) & 0x7FF;
304

    
305
}
306

    
307
/*
308
-------------------------------------------------------------------------------
309
Returns the sign bit of the double-precision floating-point value `a'.
310
-------------------------------------------------------------------------------
311
*/
312
INLINE flag extractFloat64Sign( float64 a )
313
{
314

    
315
    return a>>63;
316

    
317
}
318

    
319
/*
320
-------------------------------------------------------------------------------
321
Normalizes the subnormal double-precision floating-point value represented
322
by the denormalized significand `aSig'.  The normalized exponent and
323
significand are stored at the locations pointed to by `zExpPtr' and
324
`zSigPtr', respectively.
325
-------------------------------------------------------------------------------
326
*/
327
static void
328
 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
329
{
330
    int8 shiftCount;
331

    
332
    shiftCount = countLeadingZeros64( aSig ) - 11;
333
    *zSigPtr = aSig<<shiftCount;
334
    *zExpPtr = 1 - shiftCount;
335

    
336
}
337

    
338
/*
339
-------------------------------------------------------------------------------
340
Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
341
double-precision floating-point value, returning the result.  After being
342
shifted into the proper positions, the three fields are simply added
343
together to form the result.  This means that any integer portion of `zSig'
344
will be added into the exponent.  Since a properly normalized significand
345
will have an integer portion equal to 1, the `zExp' input should be 1 less
346
than the desired result exponent whenever `zSig' is a complete, normalized
347
significand.
348
-------------------------------------------------------------------------------
349
*/
350
INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
351
{
352

    
353
    return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
354

    
355
}
356

    
357
/*
358
-------------------------------------------------------------------------------
359
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
360
and significand `zSig', and returns the proper double-precision floating-
361
point value corresponding to the abstract input.  Ordinarily, the abstract
362
value is simply rounded and packed into the double-precision format, with
363
the inexact exception raised if the abstract input cannot be represented
364
exactly.  If the abstract value is too large, however, the overflow and
365
inexact exceptions are raised and an infinity or maximal finite value is
366
returned.  If the abstract value is too small, the input value is rounded to
367
a subnormal number, and the underflow and inexact exceptions are raised if
368
the abstract input cannot be represented exactly as a subnormal double-
369
precision floating-point number.
370
    The input significand `zSig' has its binary point between bits 62
371
and 61, which is 10 bits to the left of the usual location.  This shifted
372
significand must be normalized or smaller.  If `zSig' is not normalized,
373
`zExp' must be 0; in that case, the result returned is a subnormal number,
374
and it must not require rounding.  In the usual case that `zSig' is
375
normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
376
The handling of underflow and overflow follows the IEC/IEEE Standard for
377
Binary Floating-point Arithmetic.
378
-------------------------------------------------------------------------------
379
*/
380
static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
381
{
382
    int8 roundingMode;
383
    flag roundNearestEven;
384
    int16 roundIncrement, roundBits;
385
    flag isTiny;
386

    
387
    roundingMode = float_rounding_mode;
388
    roundNearestEven = ( roundingMode == float_round_nearest_even );
389
    roundIncrement = 0x200;
390
    if ( ! roundNearestEven ) {
391
        if ( roundingMode == float_round_to_zero ) {
392
            roundIncrement = 0;
393
        }
394
        else {
395
            roundIncrement = 0x3FF;
396
            if ( zSign ) {
397
                if ( roundingMode == float_round_up ) roundIncrement = 0;
398
            }
399
            else {
400
                if ( roundingMode == float_round_down ) roundIncrement = 0;
401
            }
402
        }
403
    }
404
    roundBits = zSig & 0x3FF;
405
    if ( 0x7FD <= (bits16) zExp ) {
406
        if (    ( 0x7FD < zExp )
407
             || (    ( zExp == 0x7FD )
408
                  && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
409
           ) {
410
            //register int lr = __builtin_return_address(0);
411
            //printk("roundAndPackFloat64 called from 0x%08x\n",lr);
412
            float_raise( float_flag_overflow | float_flag_inexact );
413
            return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
414
        }
415
        if ( zExp < 0 ) {
416
            isTiny =
417
                   ( float_detect_tininess == float_tininess_before_rounding )
418
                || ( zExp < -1 )
419
                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
420
            shift64RightJamming( zSig, - zExp, &zSig );
421
            zExp = 0;
422
            roundBits = zSig & 0x3FF;
423
            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
424
        }
425
    }
426
    if ( roundBits ) float_exception_flags |= float_flag_inexact;
427
    zSig = ( zSig + roundIncrement )>>10;
428
    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
429
    if ( zSig == 0 ) zExp = 0;
430
    return packFloat64( zSign, zExp, zSig );
431

    
432
}
433

    
434
/*
435
-------------------------------------------------------------------------------
436
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
437
and significand `zSig', and returns the proper double-precision floating-
438
point value corresponding to the abstract input.  This routine is just like
439
`roundAndPackFloat64' except that `zSig' does not have to be normalized in
440
any way.  In all cases, `zExp' must be 1 less than the ``true'' floating-
441
point exponent.
442
-------------------------------------------------------------------------------
443
*/
444
static float64
445
 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
446
{
447
    int8 shiftCount;
448

    
449
    shiftCount = countLeadingZeros64( zSig ) - 1;
450
    return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
451

    
452
}
453

    
454
#ifdef FLOATX80
455

    
456
/*
457
-------------------------------------------------------------------------------
458
Returns the fraction bits of the extended double-precision floating-point
459
value `a'.
460
-------------------------------------------------------------------------------
461
*/
462
INLINE bits64 extractFloatx80Frac( floatx80 a )
463
{
464

    
465
    return a.low;
466

    
467
}
468

    
469
/*
470
-------------------------------------------------------------------------------
471
Returns the exponent bits of the extended double-precision floating-point
472
value `a'.
473
-------------------------------------------------------------------------------
474
*/
475
INLINE int32 extractFloatx80Exp( floatx80 a )
476
{
477

    
478
    return a.high & 0x7FFF;
479

    
480
}
481

    
482
/*
483
-------------------------------------------------------------------------------
484
Returns the sign bit of the extended double-precision floating-point value
485
`a'.
486
-------------------------------------------------------------------------------
487
*/
488
INLINE flag extractFloatx80Sign( floatx80 a )
489
{
490

    
491
    return a.high>>15;
492

    
493
}
494

    
495
/*
496
-------------------------------------------------------------------------------
497
Normalizes the subnormal extended double-precision floating-point value
498
represented by the denormalized significand `aSig'.  The normalized exponent
499
and significand are stored at the locations pointed to by `zExpPtr' and
500
`zSigPtr', respectively.
501
-------------------------------------------------------------------------------
502
*/
503
static void
504
 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
505
{
506
    int8 shiftCount;
507

    
508
    shiftCount = countLeadingZeros64( aSig );
509
    *zSigPtr = aSig<<shiftCount;
510
    *zExpPtr = 1 - shiftCount;
511

    
512
}
513

    
514
/*
515
-------------------------------------------------------------------------------
516
Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
517
extended double-precision floating-point value, returning the result.
518
-------------------------------------------------------------------------------
519
*/
520
INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
521
{
522
    floatx80 z;
523

    
524
    z.low = zSig;
525
    z.high = ( ( (bits16) zSign )<<15 ) + zExp;
526
    return z;
527

    
528
}
529

    
530
/*
531
-------------------------------------------------------------------------------
532
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
533
and extended significand formed by the concatenation of `zSig0' and `zSig1',
534
and returns the proper extended double-precision floating-point value
535
corresponding to the abstract input.  Ordinarily, the abstract value is
536
rounded and packed into the extended double-precision format, with the
537
inexact exception raised if the abstract input cannot be represented
538
exactly.  If the abstract value is too large, however, the overflow and
539
inexact exceptions are raised and an infinity or maximal finite value is
540
returned.  If the abstract value is too small, the input value is rounded to
541
a subnormal number, and the underflow and inexact exceptions are raised if
542
the abstract input cannot be represented exactly as a subnormal extended
543
double-precision floating-point number.
544
    If `roundingPrecision' is 32 or 64, the result is rounded to the same
545
number of bits as single or double precision, respectively.  Otherwise, the
546
result is rounded to the full precision of the extended double-precision
547
format.
548
    The input significand must be normalized or smaller.  If the input
549
significand is not normalized, `zExp' must be 0; in that case, the result
550
returned is a subnormal number, and it must not require rounding.  The
551
handling of underflow and overflow follows the IEC/IEEE Standard for Binary
552
Floating-point Arithmetic.
553
-------------------------------------------------------------------------------
554
*/
555
static floatx80
556
 roundAndPackFloatx80(
557
     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
558
 )
559
{
560
    int8 roundingMode;
561
    flag roundNearestEven, increment, isTiny;
562
    int64 roundIncrement, roundMask, roundBits;
563

    
564
    roundingMode = float_rounding_mode;
565
    roundNearestEven = ( roundingMode == float_round_nearest_even );
566
    if ( roundingPrecision == 80 ) goto precision80;
567
    if ( roundingPrecision == 64 ) {
568
        roundIncrement = LIT64( 0x0000000000000400 );
569
        roundMask = LIT64( 0x00000000000007FF );
570
    }
571
    else if ( roundingPrecision == 32 ) {
572
        roundIncrement = LIT64( 0x0000008000000000 );
573
        roundMask = LIT64( 0x000000FFFFFFFFFF );
574
    }
575
    else {
576
        goto precision80;
577
    }
578
    zSig0 |= ( zSig1 != 0 );
579
    if ( ! roundNearestEven ) {
580
        if ( roundingMode == float_round_to_zero ) {
581
            roundIncrement = 0;
582
        }
583
        else {
584
            roundIncrement = roundMask;
585
            if ( zSign ) {
586
                if ( roundingMode == float_round_up ) roundIncrement = 0;
587
            }
588
            else {
589
                if ( roundingMode == float_round_down ) roundIncrement = 0;
590
            }
591
        }
592
    }
593
    roundBits = zSig0 & roundMask;
594
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
595
        if (    ( 0x7FFE < zExp )
596
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
597
           ) {
598
            goto overflow;
599
        }
600
        if ( zExp <= 0 ) {
601
            isTiny =
602
                   ( float_detect_tininess == float_tininess_before_rounding )
603
                || ( zExp < 0 )
604
                || ( zSig0 <= zSig0 + roundIncrement );
605
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
606
            zExp = 0;
607
            roundBits = zSig0 & roundMask;
608
            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
609
            if ( roundBits ) float_exception_flags |= float_flag_inexact;
610
            zSig0 += roundIncrement;
611
            if ( (sbits64) zSig0 < 0 ) zExp = 1;
612
            roundIncrement = roundMask + 1;
613
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
614
                roundMask |= roundIncrement;
615
            }
616
            zSig0 &= ~ roundMask;
617
            return packFloatx80( zSign, zExp, zSig0 );
618
        }
619
    }
620
    if ( roundBits ) float_exception_flags |= float_flag_inexact;
621
    zSig0 += roundIncrement;
622
    if ( zSig0 < roundIncrement ) {
623
        ++zExp;
624
        zSig0 = LIT64( 0x8000000000000000 );
625
    }
626
    roundIncrement = roundMask + 1;
627
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
628
        roundMask |= roundIncrement;
629
    }
630
    zSig0 &= ~ roundMask;
631
    if ( zSig0 == 0 ) zExp = 0;
632
    return packFloatx80( zSign, zExp, zSig0 );
633
 precision80:
634
    increment = ( (sbits64) zSig1 < 0 );
635
    if ( ! roundNearestEven ) {
636
        if ( roundingMode == float_round_to_zero ) {
637
            increment = 0;
638
        }
639
        else {
640
            if ( zSign ) {
641
                increment = ( roundingMode == float_round_down ) && zSig1;
642
            }
643
            else {
644
                increment = ( roundingMode == float_round_up ) && zSig1;
645
            }
646
        }
647
    }
648
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
649
        if (    ( 0x7FFE < zExp )
650
             || (    ( zExp == 0x7FFE )
651
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
652
                  && increment
653
                )
654
           ) {
655
            roundMask = 0;
656
 overflow:
657
            float_raise( float_flag_overflow | float_flag_inexact );
658
            if (    ( roundingMode == float_round_to_zero )
659
                 || ( zSign && ( roundingMode == float_round_up ) )
660
                 || ( ! zSign && ( roundingMode == float_round_down ) )
661
               ) {
662
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
663
            }
664
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
665
        }
666
        if ( zExp <= 0 ) {
667
            isTiny =
668
                   ( float_detect_tininess == float_tininess_before_rounding )
669
                || ( zExp < 0 )
670
                || ! increment
671
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
672
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
673
            zExp = 0;
674
            if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
675
            if ( zSig1 ) float_exception_flags |= float_flag_inexact;
676
            if ( roundNearestEven ) {
677
                increment = ( (sbits64) zSig1 < 0 );
678
            }
679
            else {
680
                if ( zSign ) {
681
                    increment = ( roundingMode == float_round_down ) && zSig1;
682
                }
683
                else {
684
                    increment = ( roundingMode == float_round_up ) && zSig1;
685
                }
686
            }
687
            if ( increment ) {
688
                ++zSig0;
689
                zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
690
                if ( (sbits64) zSig0 < 0 ) zExp = 1;
691
            }
692
            return packFloatx80( zSign, zExp, zSig0 );
693
        }
694
    }
695
    if ( zSig1 ) float_exception_flags |= float_flag_inexact;
696
    if ( increment ) {
697
        ++zSig0;
698
        if ( zSig0 == 0 ) {
699
            ++zExp;
700
            zSig0 = LIT64( 0x8000000000000000 );
701
        }
702
        else {
703
            zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
704
        }
705
    }
706
    else {
707
        if ( zSig0 == 0 ) zExp = 0;
708
    }
709
    
710
    return packFloatx80( zSign, zExp, zSig0 );
711
}
712

    
713
/*
714
-------------------------------------------------------------------------------
715
Takes an abstract floating-point value having sign `zSign', exponent
716
`zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
717
and returns the proper extended double-precision floating-point value
718
corresponding to the abstract input.  This routine is just like
719
`roundAndPackFloatx80' except that the input significand does not have to be
720
normalized.
721
-------------------------------------------------------------------------------
722
*/
723
static floatx80
724
 normalizeRoundAndPackFloatx80(
725
     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
726
 )
727
{
728
    int8 shiftCount;
729

    
730
    if ( zSig0 == 0 ) {
731
        zSig0 = zSig1;
732
        zSig1 = 0;
733
        zExp -= 64;
734
    }
735
    shiftCount = countLeadingZeros64( zSig0 );
736
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
737
    zExp -= shiftCount;
738
    return
739
        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
740

    
741
}
742

    
743
#endif
744

    
745
/*
746
-------------------------------------------------------------------------------
747
Returns the result of converting the 32-bit two's complement integer `a' to
748
the single-precision floating-point format.  The conversion is performed
749
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
750
-------------------------------------------------------------------------------
751
*/
752
float32 int32_to_float32( int32 a )
753
{
754
    flag zSign;
755

    
756
    if ( a == 0 ) return 0;
757
    if ( a == 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
758
    zSign = ( a < 0 );
759
    return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
760

    
761
}
762

    
763
/*
764
-------------------------------------------------------------------------------
765
Returns the result of converting the 32-bit two's complement integer `a' to
766
the double-precision floating-point format.  The conversion is performed
767
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
768
-------------------------------------------------------------------------------
769
*/
770
float64 int32_to_float64( int32 a )
771
{
772
    flag aSign;
773
    uint32 absA;
774
    int8 shiftCount;
775
    bits64 zSig;
776

    
777
    if ( a == 0 ) return 0;
778
    aSign = ( a < 0 );
779
    absA = aSign ? - a : a;
780
    shiftCount = countLeadingZeros32( absA ) + 21;
781
    zSig = absA;
782
    return packFloat64( aSign, 0x432 - shiftCount, zSig<<shiftCount );
783

    
784
}
785

    
786
#ifdef FLOATX80
787

    
788
/*
789
-------------------------------------------------------------------------------
790
Returns the result of converting the 32-bit two's complement integer `a'
791
to the extended double-precision floating-point format.  The conversion
792
is performed according to the IEC/IEEE Standard for Binary Floating-point
793
Arithmetic.
794
-------------------------------------------------------------------------------
795
*/
796
floatx80 int32_to_floatx80( int32 a )
797
{
798
    flag zSign;
799
    uint32 absA;
800
    int8 shiftCount;
801
    bits64 zSig;
802

    
803
    if ( a == 0 ) return packFloatx80( 0, 0, 0 );
804
    zSign = ( a < 0 );
805
    absA = zSign ? - a : a;
806
    shiftCount = countLeadingZeros32( absA ) + 32;
807
    zSig = absA;
808
    return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
809

    
810
}
811

    
812
#endif
813

    
814
/*
815
-------------------------------------------------------------------------------
816
Returns the result of converting the single-precision floating-point value
817
`a' to the 32-bit two's complement integer format.  The conversion is
818
performed according to the IEC/IEEE Standard for Binary Floating-point
819
Arithmetic---which means in particular that the conversion is rounded
820
according to the current rounding mode.  If `a' is a NaN, the largest
821
positive integer is returned.  Otherwise, if the conversion overflows, the
822
largest integer with the same sign as `a' is returned.
823
-------------------------------------------------------------------------------
824
*/
825
int32 float32_to_int32( float32 a )
826
{
827
    flag aSign;
828
    int16 aExp, shiftCount;
829
    bits32 aSig;
830
    bits64 zSig;
831

    
832
    aSig = extractFloat32Frac( a );
833
    aExp = extractFloat32Exp( a );
834
    aSign = extractFloat32Sign( a );
835
    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
836
    if ( aExp ) aSig |= 0x00800000;
837
    shiftCount = 0xAF - aExp;
838
    zSig = aSig;
839
    zSig <<= 32;
840
    if ( 0 < shiftCount ) shift64RightJamming( zSig, shiftCount, &zSig );
841
    return roundAndPackInt32( aSign, zSig );
842

    
843
}
844

    
845
/*
846
-------------------------------------------------------------------------------
847
Returns the result of converting the single-precision floating-point value
848
`a' to the 32-bit two's complement integer format.  The conversion is
849
performed according to the IEC/IEEE Standard for Binary Floating-point
850
Arithmetic, except that the conversion is always rounded toward zero.  If
851
`a' is a NaN, the largest positive integer is returned.  Otherwise, if the
852
conversion overflows, the largest integer with the same sign as `a' is
853
returned.
854
-------------------------------------------------------------------------------
855
*/
856
int32 float32_to_int32_round_to_zero( float32 a )
857
{
858
    flag aSign;
859
    int16 aExp, shiftCount;
860
    bits32 aSig;
861
    int32 z;
862

    
863
    aSig = extractFloat32Frac( a );
864
    aExp = extractFloat32Exp( a );
865
    aSign = extractFloat32Sign( a );
866
    shiftCount = aExp - 0x9E;
867
    if ( 0 <= shiftCount ) {
868
        if ( a == 0xCF000000 ) return 0x80000000;
869
        float_raise( float_flag_invalid );
870
        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
871
        return 0x80000000;
872
    }
873
    else if ( aExp <= 0x7E ) {
874
        if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
875
        return 0;
876
    }
877
    aSig = ( aSig | 0x00800000 )<<8;
878
    z = aSig>>( - shiftCount );
879
    if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
880
        float_exception_flags |= float_flag_inexact;
881
    }
882
    return aSign ? - z : z;
883

    
884
}
885

    
886
/*
887
-------------------------------------------------------------------------------
888
Returns the result of converting the single-precision floating-point value
889
`a' to the double-precision floating-point format.  The conversion is
890
performed according to the IEC/IEEE Standard for Binary Floating-point
891
Arithmetic.
892
-------------------------------------------------------------------------------
893
*/
894
float64 float32_to_float64( float32 a )
895
{
896
    flag aSign;
897
    int16 aExp;
898
    bits32 aSig;
899

    
900
    aSig = extractFloat32Frac( a );
901
    aExp = extractFloat32Exp( a );
902
    aSign = extractFloat32Sign( a );
903
    if ( aExp == 0xFF ) {
904
        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
905
        return packFloat64( aSign, 0x7FF, 0 );
906
    }
907
    if ( aExp == 0 ) {
908
        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
909
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
910
        --aExp;
911
    }
912
    return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
913

    
914
}
915

    
916
#ifdef FLOATX80
917

    
918
/*
919
-------------------------------------------------------------------------------
920
Returns the result of converting the single-precision floating-point value
921
`a' to the extended double-precision floating-point format.  The conversion
922
is performed according to the IEC/IEEE Standard for Binary Floating-point
923
Arithmetic.
924
-------------------------------------------------------------------------------
925
*/
926
floatx80 float32_to_floatx80( float32 a )
927
{
928
    flag aSign;
929
    int16 aExp;
930
    bits32 aSig;
931

    
932
    aSig = extractFloat32Frac( a );
933
    aExp = extractFloat32Exp( a );
934
    aSign = extractFloat32Sign( a );
935
    if ( aExp == 0xFF ) {
936
        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
937
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
938
    }
939
    if ( aExp == 0 ) {
940
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
941
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
942
    }
943
    aSig |= 0x00800000;
944
    return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
945

    
946
}
947

    
948
#endif
949

    
950
/*
951
-------------------------------------------------------------------------------
952
Rounds the single-precision floating-point value `a' to an integer, and
953
returns the result as a single-precision floating-point value.  The
954
operation is performed according to the IEC/IEEE Standard for Binary
955
Floating-point Arithmetic.
956
-------------------------------------------------------------------------------
957
*/
958
float32 float32_round_to_int( float32 a )
959
{
960
    flag aSign;
961
    int16 aExp;
962
    bits32 lastBitMask, roundBitsMask;
963
    int8 roundingMode;
964
    float32 z;
965

    
966
    aExp = extractFloat32Exp( a );
967
    if ( 0x96 <= aExp ) {
968
        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
969
            return propagateFloat32NaN( a, a );
970
        }
971
        return a;
972
    }
973
    if ( aExp <= 0x7E ) {
974
        if ( (bits32) ( a<<1 ) == 0 ) return a;
975
        float_exception_flags |= float_flag_inexact;
976
        aSign = extractFloat32Sign( a );
977
        switch ( float_rounding_mode ) {
978
         case float_round_nearest_even:
979
            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
980
                return packFloat32( aSign, 0x7F, 0 );
981
            }
982
            break;
983
         case float_round_down:
984
            return aSign ? 0xBF800000 : 0;
985
         case float_round_up:
986
            return aSign ? 0x80000000 : 0x3F800000;
987
        }
988
        return packFloat32( aSign, 0, 0 );
989
    }
990
    lastBitMask = 1;
991
    lastBitMask <<= 0x96 - aExp;
992
    roundBitsMask = lastBitMask - 1;
993
    z = a;
994
    roundingMode = float_rounding_mode;
995
    if ( roundingMode == float_round_nearest_even ) {
996
        z += lastBitMask>>1;
997
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
998
    }
999
    else if ( roundingMode != float_round_to_zero ) {
1000
        if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1001
            z += roundBitsMask;
1002
        }
1003
    }
1004
    z &= ~ roundBitsMask;
1005
    if ( z != a ) float_exception_flags |= float_flag_inexact;
1006
    return z;
1007

    
1008
}
1009

    
1010
/*
1011
-------------------------------------------------------------------------------
1012
Returns the result of adding the absolute values of the single-precision
1013
floating-point values `a' and `b'.  If `zSign' is true, the sum is negated
1014
before being returned.  `zSign' is ignored if the result is a NaN.  The
1015
addition is performed according to the IEC/IEEE Standard for Binary
1016
Floating-point Arithmetic.
1017
-------------------------------------------------------------------------------
1018
*/
1019
static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1020
{
1021
    int16 aExp, bExp, zExp;
1022
    bits32 aSig, bSig, zSig;
1023
    int16 expDiff;
1024

    
1025
    aSig = extractFloat32Frac( a );
1026
    aExp = extractFloat32Exp( a );
1027
    bSig = extractFloat32Frac( b );
1028
    bExp = extractFloat32Exp( b );
1029
    expDiff = aExp - bExp;
1030
    aSig <<= 6;
1031
    bSig <<= 6;
1032
    if ( 0 < expDiff ) {
1033
        if ( aExp == 0xFF ) {
1034
            if ( aSig ) return propagateFloat32NaN( a, b );
1035
            return a;
1036
        }
1037
        if ( bExp == 0 ) {
1038
            --expDiff;
1039
        }
1040
        else {
1041
            bSig |= 0x20000000;
1042
        }
1043
        shift32RightJamming( bSig, expDiff, &bSig );
1044
        zExp = aExp;
1045
    }
1046
    else if ( expDiff < 0 ) {
1047
        if ( bExp == 0xFF ) {
1048
            if ( bSig ) return propagateFloat32NaN( a, b );
1049
            return packFloat32( zSign, 0xFF, 0 );
1050
        }
1051
        if ( aExp == 0 ) {
1052
            ++expDiff;
1053
        }
1054
        else {
1055
            aSig |= 0x20000000;
1056
        }
1057
        shift32RightJamming( aSig, - expDiff, &aSig );
1058
        zExp = bExp;
1059
    }
1060
    else {
1061
        if ( aExp == 0xFF ) {
1062
            if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1063
            return a;
1064
        }
1065
        if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1066
        zSig = 0x40000000 + aSig + bSig;
1067
        zExp = aExp;
1068
        goto roundAndPack;
1069
    }
1070
    aSig |= 0x20000000;
1071
    zSig = ( aSig + bSig )<<1;
1072
    --zExp;
1073
    if ( (sbits32) zSig < 0 ) {
1074
        zSig = aSig + bSig;
1075
        ++zExp;
1076
    }
1077
 roundAndPack:
1078
    return roundAndPackFloat32( zSign, zExp, zSig );
1079

    
1080
}
1081

    
1082
/*
1083
-------------------------------------------------------------------------------
1084
Returns the result of subtracting the absolute values of the single-
1085
precision floating-point values `a' and `b'.  If `zSign' is true, the
1086
difference is negated before being returned.  `zSign' is ignored if the
1087
result is a NaN.  The subtraction is performed according to the IEC/IEEE
1088
Standard for Binary Floating-point Arithmetic.
1089
-------------------------------------------------------------------------------
1090
*/
1091
static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1092
{
1093
    int16 aExp, bExp, zExp;
1094
    bits32 aSig, bSig, zSig;
1095
    int16 expDiff;
1096

    
1097
    aSig = extractFloat32Frac( a );
1098
    aExp = extractFloat32Exp( a );
1099
    bSig = extractFloat32Frac( b );
1100
    bExp = extractFloat32Exp( b );
1101
    expDiff = aExp - bExp;
1102
    aSig <<= 7;
1103
    bSig <<= 7;
1104
    if ( 0 < expDiff ) goto aExpBigger;
1105
    if ( expDiff < 0 ) goto bExpBigger;
1106
    if ( aExp == 0xFF ) {
1107
        if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1108
        float_raise( float_flag_invalid );
1109
        return float32_default_nan;
1110
    }
1111
    if ( aExp == 0 ) {
1112
        aExp = 1;
1113
        bExp = 1;
1114
    }
1115
    if ( bSig < aSig ) goto aBigger;
1116
    if ( aSig < bSig ) goto bBigger;
1117
    return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1118
 bExpBigger:
1119
    if ( bExp == 0xFF ) {
1120
        if ( bSig ) return propagateFloat32NaN( a, b );
1121
        return packFloat32( zSign ^ 1, 0xFF, 0 );
1122
    }
1123
    if ( aExp == 0 ) {
1124
        ++expDiff;
1125
    }
1126
    else {
1127
        aSig |= 0x40000000;
1128
    }
1129
    shift32RightJamming( aSig, - expDiff, &aSig );
1130
    bSig |= 0x40000000;
1131
 bBigger:
1132
    zSig = bSig - aSig;
1133
    zExp = bExp;
1134
    zSign ^= 1;
1135
    goto normalizeRoundAndPack;
1136
 aExpBigger:
1137
    if ( aExp == 0xFF ) {
1138
        if ( aSig ) return propagateFloat32NaN( a, b );
1139
        return a;
1140
    }
1141
    if ( bExp == 0 ) {
1142
        --expDiff;
1143
    }
1144
    else {
1145
        bSig |= 0x40000000;
1146
    }
1147
    shift32RightJamming( bSig, expDiff, &bSig );
1148
    aSig |= 0x40000000;
1149
 aBigger:
1150
    zSig = aSig - bSig;
1151
    zExp = aExp;
1152
 normalizeRoundAndPack:
1153
    --zExp;
1154
    return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1155

    
1156
}
1157

    
1158
/*
1159
-------------------------------------------------------------------------------
1160
Returns the result of adding the single-precision floating-point values `a'
1161
and `b'.  The operation is performed according to the IEC/IEEE Standard for
1162
Binary Floating-point Arithmetic.
1163
-------------------------------------------------------------------------------
1164
*/
1165
float32 float32_add( float32 a, float32 b )
1166
{
1167
    flag aSign, bSign;
1168

    
1169
    aSign = extractFloat32Sign( a );
1170
    bSign = extractFloat32Sign( b );
1171
    if ( aSign == bSign ) {
1172
        return addFloat32Sigs( a, b, aSign );
1173
    }
1174
    else {
1175
        return subFloat32Sigs( a, b, aSign );
1176
    }
1177

    
1178
}
1179

    
1180
/*
1181
-------------------------------------------------------------------------------
1182
Returns the result of subtracting the single-precision floating-point values
1183
`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1184
for Binary Floating-point Arithmetic.
1185
-------------------------------------------------------------------------------
1186
*/
1187
float32 float32_sub( float32 a, float32 b )
1188
{
1189
    flag aSign, bSign;
1190

    
1191
    aSign = extractFloat32Sign( a );
1192
    bSign = extractFloat32Sign( b );
1193
    if ( aSign == bSign ) {
1194
        return subFloat32Sigs( a, b, aSign );
1195
    }
1196
    else {
1197
        return addFloat32Sigs( a, b, aSign );
1198
    }
1199

    
1200
}
1201

    
1202
/*
1203
-------------------------------------------------------------------------------
1204
Returns the result of multiplying the single-precision floating-point values
1205
`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1206
for Binary Floating-point Arithmetic.
1207
-------------------------------------------------------------------------------
1208
*/
1209
float32 float32_mul( float32 a, float32 b )
1210
{
1211
    flag aSign, bSign, zSign;
1212
    int16 aExp, bExp, zExp;
1213
    bits32 aSig, bSig;
1214
    bits64 zSig64;
1215
    bits32 zSig;
1216

    
1217
    aSig = extractFloat32Frac( a );
1218
    aExp = extractFloat32Exp( a );
1219
    aSign = extractFloat32Sign( a );
1220
    bSig = extractFloat32Frac( b );
1221
    bExp = extractFloat32Exp( b );
1222
    bSign = extractFloat32Sign( b );
1223
    zSign = aSign ^ bSign;
1224
    if ( aExp == 0xFF ) {
1225
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1226
            return propagateFloat32NaN( a, b );
1227
        }
1228
        if ( ( bExp | bSig ) == 0 ) {
1229
            float_raise( float_flag_invalid );
1230
            return float32_default_nan;
1231
        }
1232
        return packFloat32( zSign, 0xFF, 0 );
1233
    }
1234
    if ( bExp == 0xFF ) {
1235
        if ( bSig ) return propagateFloat32NaN( a, b );
1236
        if ( ( aExp | aSig ) == 0 ) {
1237
            float_raise( float_flag_invalid );
1238
            return float32_default_nan;
1239
        }
1240
        return packFloat32( zSign, 0xFF, 0 );
1241
    }
1242
    if ( aExp == 0 ) {
1243
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1244
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1245
    }
1246
    if ( bExp == 0 ) {
1247
        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1248
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1249
    }
1250
    zExp = aExp + bExp - 0x7F;
1251
    aSig = ( aSig | 0x00800000 )<<7;
1252
    bSig = ( bSig | 0x00800000 )<<8;
1253
    shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1254
    zSig = zSig64;
1255
    if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1256
        zSig <<= 1;
1257
        --zExp;
1258
    }
1259
    return roundAndPackFloat32( zSign, zExp, zSig );
1260

    
1261
}
1262

    
1263
/*
1264
-------------------------------------------------------------------------------
1265
Returns the result of dividing the single-precision floating-point value `a'
1266
by the corresponding value `b'.  The operation is performed according to the
1267
IEC/IEEE Standard for Binary Floating-point Arithmetic.
1268
-------------------------------------------------------------------------------
1269
*/
1270
float32 float32_div( float32 a, float32 b )
1271
{
1272
    flag aSign, bSign, zSign;
1273
    int16 aExp, bExp, zExp;
1274
    bits32 aSig, bSig, zSig;
1275

    
1276
    aSig = extractFloat32Frac( a );
1277
    aExp = extractFloat32Exp( a );
1278
    aSign = extractFloat32Sign( a );
1279
    bSig = extractFloat32Frac( b );
1280
    bExp = extractFloat32Exp( b );
1281
    bSign = extractFloat32Sign( b );
1282
    zSign = aSign ^ bSign;
1283
    if ( aExp == 0xFF ) {
1284
        if ( aSig ) return propagateFloat32NaN( a, b );
1285
        if ( bExp == 0xFF ) {
1286
            if ( bSig ) return propagateFloat32NaN( a, b );
1287
            float_raise( float_flag_invalid );
1288
            return float32_default_nan;
1289
        }
1290
        return packFloat32( zSign, 0xFF, 0 );
1291
    }
1292
    if ( bExp == 0xFF ) {
1293
        if ( bSig ) return propagateFloat32NaN( a, b );
1294
        return packFloat32( zSign, 0, 0 );
1295
    }
1296
    if ( bExp == 0 ) {
1297
        if ( bSig == 0 ) {
1298
            if ( ( aExp | aSig ) == 0 ) {
1299
                float_raise( float_flag_invalid );
1300
                return float32_default_nan;
1301
            }
1302
            float_raise( float_flag_divbyzero );
1303
            return packFloat32( zSign, 0xFF, 0 );
1304
        }
1305
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1306
    }
1307
    if ( aExp == 0 ) {
1308
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1309
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1310
    }
1311
    zExp = aExp - bExp + 0x7D;
1312
    aSig = ( aSig | 0x00800000 )<<7;
1313
    bSig = ( bSig | 0x00800000 )<<8;
1314
    if ( bSig <= ( aSig + aSig ) ) {
1315
        aSig >>= 1;
1316
        ++zExp;
1317
    }
1318
    zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1319
    if ( ( zSig & 0x3F ) == 0 ) {
1320
        zSig |= ( ( (bits64) bSig ) * zSig != ( (bits64) aSig )<<32 );
1321
    }
1322
    return roundAndPackFloat32( zSign, zExp, zSig );
1323

    
1324
}
1325

    
1326
/*
1327
-------------------------------------------------------------------------------
1328
Returns the remainder of the single-precision floating-point value `a'
1329
with respect to the corresponding value `b'.  The operation is performed
1330
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1331
-------------------------------------------------------------------------------
1332
*/
1333
float32 float32_rem( float32 a, float32 b )
1334
{
1335
    flag aSign, bSign, zSign;
1336
    int16 aExp, bExp, expDiff;
1337
    bits32 aSig, bSig;
1338
    bits32 q;
1339
    bits64 aSig64, bSig64, q64;
1340
    bits32 alternateASig;
1341
    sbits32 sigMean;
1342

    
1343
    aSig = extractFloat32Frac( a );
1344
    aExp = extractFloat32Exp( a );
1345
    aSign = extractFloat32Sign( a );
1346
    bSig = extractFloat32Frac( b );
1347
    bExp = extractFloat32Exp( b );
1348
    bSign = extractFloat32Sign( b );
1349
    if ( aExp == 0xFF ) {
1350
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1351
            return propagateFloat32NaN( a, b );
1352
        }
1353
        float_raise( float_flag_invalid );
1354
        return float32_default_nan;
1355
    }
1356
    if ( bExp == 0xFF ) {
1357
        if ( bSig ) return propagateFloat32NaN( a, b );
1358
        return a;
1359
    }
1360
    if ( bExp == 0 ) {
1361
        if ( bSig == 0 ) {
1362
            float_raise( float_flag_invalid );
1363
            return float32_default_nan;
1364
        }
1365
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1366
    }
1367
    if ( aExp == 0 ) {
1368
        if ( aSig == 0 ) return a;
1369
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1370
    }
1371
    expDiff = aExp - bExp;
1372
    aSig |= 0x00800000;
1373
    bSig |= 0x00800000;
1374
    if ( expDiff < 32 ) {
1375
        aSig <<= 8;
1376
        bSig <<= 8;
1377
        if ( expDiff < 0 ) {
1378
            if ( expDiff < -1 ) return a;
1379
            aSig >>= 1;
1380
        }
1381
        q = ( bSig <= aSig );
1382
        if ( q ) aSig -= bSig;
1383
        if ( 0 < expDiff ) {
1384
            q = ( ( (bits64) aSig )<<32 ) / bSig;
1385
            q >>= 32 - expDiff;
1386
            bSig >>= 2;
1387
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1388
        }
1389
        else {
1390
            aSig >>= 2;
1391
            bSig >>= 2;
1392
        }
1393
    }
1394
    else {
1395
        if ( bSig <= aSig ) aSig -= bSig;
1396
        aSig64 = ( (bits64) aSig )<<40;
1397
        bSig64 = ( (bits64) bSig )<<40;
1398
        expDiff -= 64;
1399
        while ( 0 < expDiff ) {
1400
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1401
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1402
            aSig64 = - ( ( bSig * q64 )<<38 );
1403
            expDiff -= 62;
1404
        }
1405
        expDiff += 64;
1406
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1407
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1408
        q = q64>>( 64 - expDiff );
1409
        bSig <<= 6;
1410
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1411
    }
1412
    do {
1413
        alternateASig = aSig;
1414
        ++q;
1415
        aSig -= bSig;
1416
    } while ( 0 <= (sbits32) aSig );
1417
    sigMean = aSig + alternateASig;
1418
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1419
        aSig = alternateASig;
1420
    }
1421
    zSign = ( (sbits32) aSig < 0 );
1422
    if ( zSign ) aSig = - aSig;
1423
    return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
1424

    
1425
}
1426

    
1427
/*
1428
-------------------------------------------------------------------------------
1429
Returns the square root of the single-precision floating-point value `a'.
1430
The operation is performed according to the IEC/IEEE Standard for Binary
1431
Floating-point Arithmetic.
1432
-------------------------------------------------------------------------------
1433
*/
1434
float32 float32_sqrt( float32 a )
1435
{
1436
    flag aSign;
1437
    int16 aExp, zExp;
1438
    bits32 aSig, zSig;
1439
    bits64 rem, term;
1440

    
1441
    aSig = extractFloat32Frac( a );
1442
    aExp = extractFloat32Exp( a );
1443
    aSign = extractFloat32Sign( a );
1444
    if ( aExp == 0xFF ) {
1445
        if ( aSig ) return propagateFloat32NaN( a, 0 );
1446
        if ( ! aSign ) return a;
1447
        float_raise( float_flag_invalid );
1448
        return float32_default_nan;
1449
    }
1450
    if ( aSign ) {
1451
        if ( ( aExp | aSig ) == 0 ) return a;
1452
        float_raise( float_flag_invalid );
1453
        return float32_default_nan;
1454
    }
1455
    if ( aExp == 0 ) {
1456
        if ( aSig == 0 ) return 0;
1457
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1458
    }
1459
    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1460
    aSig = ( aSig | 0x00800000 )<<8;
1461
    zSig = estimateSqrt32( aExp, aSig ) + 2;
1462
    if ( ( zSig & 0x7F ) <= 5 ) {
1463
        if ( zSig < 2 ) {
1464
            zSig = 0xFFFFFFFF;
1465
        }
1466
        else {
1467
            aSig >>= aExp & 1;
1468
            term = ( (bits64) zSig ) * zSig;
1469
            rem = ( ( (bits64) aSig )<<32 ) - term;
1470
            while ( (sbits64) rem < 0 ) {
1471
                --zSig;
1472
                rem += ( ( (bits64) zSig )<<1 ) | 1;
1473
            }
1474
            zSig |= ( rem != 0 );
1475
        }
1476
    }
1477
    shift32RightJamming( zSig, 1, &zSig );
1478
    return roundAndPackFloat32( 0, zExp, zSig );
1479

    
1480
}
1481

    
1482
/*
1483
-------------------------------------------------------------------------------
1484
Returns 1 if the single-precision floating-point value `a' is equal to the
1485
corresponding value `b', and 0 otherwise.  The comparison is performed
1486
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1487
-------------------------------------------------------------------------------
1488
*/
1489
flag float32_eq( float32 a, float32 b )
1490
{
1491

    
1492
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1493
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1494
       ) {
1495
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1496
            float_raise( float_flag_invalid );
1497
        }
1498
        return 0;
1499
    }
1500
    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1501

    
1502
}
1503

    
1504
/*
1505
-------------------------------------------------------------------------------
1506
Returns 1 if the single-precision floating-point value `a' is less than or
1507
equal to the corresponding value `b', and 0 otherwise.  The comparison is
1508
performed according to the IEC/IEEE Standard for Binary Floating-point
1509
Arithmetic.
1510
-------------------------------------------------------------------------------
1511
*/
1512
flag float32_le( float32 a, float32 b )
1513
{
1514
    flag aSign, bSign;
1515

    
1516
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1517
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1518
       ) {
1519
        float_raise( float_flag_invalid );
1520
        return 0;
1521
    }
1522
    aSign = extractFloat32Sign( a );
1523
    bSign = extractFloat32Sign( b );
1524
    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1525
    return ( a == b ) || ( aSign ^ ( a < b ) );
1526

    
1527
}
1528

    
1529
/*
1530
-------------------------------------------------------------------------------
1531
Returns 1 if the single-precision floating-point value `a' is less than
1532
the corresponding value `b', and 0 otherwise.  The comparison is performed
1533
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1534
-------------------------------------------------------------------------------
1535
*/
1536
flag float32_lt( float32 a, float32 b )
1537
{
1538
    flag aSign, bSign;
1539

    
1540
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1541
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1542
       ) {
1543
        float_raise( float_flag_invalid );
1544
        return 0;
1545
    }
1546
    aSign = extractFloat32Sign( a );
1547
    bSign = extractFloat32Sign( b );
1548
    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1549
    return ( a != b ) && ( aSign ^ ( a < b ) );
1550

    
1551
}
1552

    
1553
/*
1554
-------------------------------------------------------------------------------
1555
Returns 1 if the single-precision floating-point value `a' is equal to the
1556
corresponding value `b', and 0 otherwise.  The invalid exception is raised
1557
if either operand is a NaN.  Otherwise, the comparison is performed
1558
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1559
-------------------------------------------------------------------------------
1560
*/
1561
flag float32_eq_signaling( float32 a, float32 b )
1562
{
1563

    
1564
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1565
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1566
       ) {
1567
        float_raise( float_flag_invalid );
1568
        return 0;
1569
    }
1570
    return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1571

    
1572
}
1573

    
1574
/*
1575
-------------------------------------------------------------------------------
1576
Returns 1 if the single-precision floating-point value `a' is less than or
1577
equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
1578
cause an exception.  Otherwise, the comparison is performed according to the
1579
IEC/IEEE Standard for Binary Floating-point Arithmetic.
1580
-------------------------------------------------------------------------------
1581
*/
1582
flag float32_le_quiet( float32 a, float32 b )
1583
{
1584
    flag aSign, bSign;
1585
    //int16 aExp, bExp;
1586

    
1587
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1588
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1589
       ) {
1590
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1591
            float_raise( float_flag_invalid );
1592
        }
1593
        return 0;
1594
    }
1595
    aSign = extractFloat32Sign( a );
1596
    bSign = extractFloat32Sign( b );
1597
    if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1598
    return ( a == b ) || ( aSign ^ ( a < b ) );
1599

    
1600
}
1601

    
1602
/*
1603
-------------------------------------------------------------------------------
1604
Returns 1 if the single-precision floating-point value `a' is less than
1605
the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
1606
exception.  Otherwise, the comparison is performed according to the IEC/IEEE
1607
Standard for Binary Floating-point Arithmetic.
1608
-------------------------------------------------------------------------------
1609
*/
1610
flag float32_lt_quiet( float32 a, float32 b )
1611
{
1612
    flag aSign, bSign;
1613

    
1614
    if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1615
         || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1616
       ) {
1617
        if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1618
            float_raise( float_flag_invalid );
1619
        }
1620
        return 0;
1621
    }
1622
    aSign = extractFloat32Sign( a );
1623
    bSign = extractFloat32Sign( b );
1624
    if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1625
    return ( a != b ) && ( aSign ^ ( a < b ) );
1626

    
1627
}
1628

    
1629
/*
1630
-------------------------------------------------------------------------------
1631
Returns the result of converting the double-precision floating-point value
1632
`a' to the 32-bit two's complement integer format.  The conversion is
1633
performed according to the IEC/IEEE Standard for Binary Floating-point
1634
Arithmetic---which means in particular that the conversion is rounded
1635
according to the current rounding mode.  If `a' is a NaN, the largest
1636
positive integer is returned.  Otherwise, if the conversion overflows, the
1637
largest integer with the same sign as `a' is returned.
1638
-------------------------------------------------------------------------------
1639
*/
1640
int32 float64_to_int32( float64 a )
1641
{
1642
    flag aSign;
1643
    int16 aExp, shiftCount;
1644
    bits64 aSig;
1645

    
1646
    aSig = extractFloat64Frac( a );
1647
    aExp = extractFloat64Exp( a );
1648
    aSign = extractFloat64Sign( a );
1649
    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1650
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1651
    shiftCount = 0x42C - aExp;
1652
    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1653
    return roundAndPackInt32( aSign, aSig );
1654

    
1655
}
1656

    
1657
/*
1658
-------------------------------------------------------------------------------
1659
Returns the result of converting the double-precision floating-point value
1660
`a' to the 32-bit two's complement integer format.  The conversion is
1661
performed according to the IEC/IEEE Standard for Binary Floating-point
1662
Arithmetic, except that the conversion is always rounded toward zero.  If
1663
`a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1664
conversion overflows, the largest integer with the same sign as `a' is
1665
returned.
1666
-------------------------------------------------------------------------------
1667
*/
1668
int32 float64_to_int32_round_to_zero( float64 a )
1669
{
1670
    flag aSign;
1671
    int16 aExp, shiftCount;
1672
    bits64 aSig, savedASig;
1673
    int32 z;
1674

    
1675
    aSig = extractFloat64Frac( a );
1676
    aExp = extractFloat64Exp( a );
1677
    aSign = extractFloat64Sign( a );
1678
    shiftCount = 0x433 - aExp;
1679
    if ( shiftCount < 21 ) {
1680
        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1681
        goto invalid;
1682
    }
1683
    else if ( 52 < shiftCount ) {
1684
        if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
1685
        return 0;
1686
    }
1687
    aSig |= LIT64( 0x0010000000000000 );
1688
    savedASig = aSig;
1689
    aSig >>= shiftCount;
1690
    z = aSig;
1691
    if ( aSign ) z = - z;
1692
    if ( ( z < 0 ) ^ aSign ) {
1693
 invalid:
1694
        float_exception_flags |= float_flag_invalid;
1695
        return aSign ? 0x80000000 : 0x7FFFFFFF;
1696
    }
1697
    if ( ( aSig<<shiftCount ) != savedASig ) {
1698
        float_exception_flags |= float_flag_inexact;
1699
    }
1700
    return z;
1701

    
1702
}
1703

    
1704
/*
1705
-------------------------------------------------------------------------------
1706
Returns the result of converting the double-precision floating-point value
1707
`a' to the 32-bit two's complement unsigned integer format.  The conversion
1708
is performed according to the IEC/IEEE Standard for Binary Floating-point
1709
Arithmetic---which means in particular that the conversion is rounded
1710
according to the current rounding mode.  If `a' is a NaN, the largest
1711
positive integer is returned.  Otherwise, if the conversion overflows, the
1712
largest positive integer is returned.
1713
-------------------------------------------------------------------------------
1714
*/
1715
int32 float64_to_uint32( float64 a )
1716
{
1717
    flag aSign;
1718
    int16 aExp, shiftCount;
1719
    bits64 aSig;
1720

    
1721
    aSig = extractFloat64Frac( a );
1722
    aExp = extractFloat64Exp( a );
1723
    aSign = 0; //extractFloat64Sign( a );
1724
    //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1725
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1726
    shiftCount = 0x42C - aExp;
1727
    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1728
    return roundAndPackInt32( aSign, aSig );
1729
}
1730

    
1731
/*
1732
-------------------------------------------------------------------------------
1733
Returns the result of converting the double-precision floating-point value
1734
`a' to the 32-bit two's complement integer format.  The conversion is
1735
performed according to the IEC/IEEE Standard for Binary Floating-point
1736
Arithmetic, except that the conversion is always rounded toward zero.  If
1737
`a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1738
conversion overflows, the largest positive integer is returned.
1739
-------------------------------------------------------------------------------
1740
*/
1741
int32 float64_to_uint32_round_to_zero( float64 a )
1742
{
1743
    flag aSign;
1744
    int16 aExp, shiftCount;
1745
    bits64 aSig, savedASig;
1746
    int32 z;
1747

    
1748
    aSig = extractFloat64Frac( a );
1749
    aExp = extractFloat64Exp( a );
1750
    aSign = extractFloat64Sign( a );
1751
    shiftCount = 0x433 - aExp;
1752
    if ( shiftCount < 21 ) {
1753
        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1754
        goto invalid;
1755
    }
1756
    else if ( 52 < shiftCount ) {
1757
        if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
1758
        return 0;
1759
    }
1760
    aSig |= LIT64( 0x0010000000000000 );
1761
    savedASig = aSig;
1762
    aSig >>= shiftCount;
1763
    z = aSig;
1764
    if ( aSign ) z = - z;
1765
    if ( ( z < 0 ) ^ aSign ) {
1766
 invalid:
1767
        float_exception_flags |= float_flag_invalid;
1768
        return aSign ? 0x80000000 : 0x7FFFFFFF;
1769
    }
1770
    if ( ( aSig<<shiftCount ) != savedASig ) {
1771
        float_exception_flags |= float_flag_inexact;
1772
    }
1773
    return z;
1774
}
1775

    
1776
/*
1777
-------------------------------------------------------------------------------
1778
Returns the result of converting the double-precision floating-point value
1779
`a' to the single-precision floating-point format.  The conversion is
1780
performed according to the IEC/IEEE Standard for Binary Floating-point
1781
Arithmetic.
1782
-------------------------------------------------------------------------------
1783
*/
1784
float32 float64_to_float32( float64 a )
1785
{
1786
    flag aSign;
1787
    int16 aExp;
1788
    bits64 aSig;
1789
    bits32 zSig;
1790

    
1791
    aSig = extractFloat64Frac( a );
1792
    aExp = extractFloat64Exp( a );
1793
    aSign = extractFloat64Sign( a );
1794
    if ( aExp == 0x7FF ) {
1795
        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
1796
        return packFloat32( aSign, 0xFF, 0 );
1797
    }
1798
    shift64RightJamming( aSig, 22, &aSig );
1799
    zSig = aSig;
1800
    if ( aExp || zSig ) {
1801
        zSig |= 0x40000000;
1802
        aExp -= 0x381;
1803
    }
1804
    return roundAndPackFloat32( aSign, aExp, zSig );
1805

    
1806
}
1807

    
1808
#ifdef FLOATX80
1809

    
1810
/*
1811
-------------------------------------------------------------------------------
1812
Returns the result of converting the double-precision floating-point value
1813
`a' to the extended double-precision floating-point format.  The conversion
1814
is performed according to the IEC/IEEE Standard for Binary Floating-point
1815
Arithmetic.
1816
-------------------------------------------------------------------------------
1817
*/
1818
floatx80 float64_to_floatx80( float64 a )
1819
{
1820
    flag aSign;
1821
    int16 aExp;
1822
    bits64 aSig;
1823

    
1824
    aSig = extractFloat64Frac( a );
1825
    aExp = extractFloat64Exp( a );
1826
    aSign = extractFloat64Sign( a );
1827
    if ( aExp == 0x7FF ) {
1828
        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
1829
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1830
    }
1831
    if ( aExp == 0 ) {
1832
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1833
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
1834
    }
1835
    return
1836
        packFloatx80(
1837
            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
1838

    
1839
}
1840

    
1841
#endif
1842

    
1843
/*
1844
-------------------------------------------------------------------------------
1845
Rounds the double-precision floating-point value `a' to an integer, and
1846
returns the result as a double-precision floating-point value.  The
1847
operation is performed according to the IEC/IEEE Standard for Binary
1848
Floating-point Arithmetic.
1849
-------------------------------------------------------------------------------
1850
*/
1851
float64 float64_round_to_int( float64 a )
1852
{
1853
    flag aSign;
1854
    int16 aExp;
1855
    bits64 lastBitMask, roundBitsMask;
1856
    int8 roundingMode;
1857
    float64 z;
1858

    
1859
    aExp = extractFloat64Exp( a );
1860
    if ( 0x433 <= aExp ) {
1861
        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
1862
            return propagateFloat64NaN( a, a );
1863
        }
1864
        return a;
1865
    }
1866
    if ( aExp <= 0x3FE ) {
1867
        if ( (bits64) ( a<<1 ) == 0 ) return a;
1868
        float_exception_flags |= float_flag_inexact;
1869
        aSign = extractFloat64Sign( a );
1870
        switch ( float_rounding_mode ) {
1871
         case float_round_nearest_even:
1872
            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
1873
                return packFloat64( aSign, 0x3FF, 0 );
1874
            }
1875
            break;
1876
         case float_round_down:
1877
            return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
1878
         case float_round_up:
1879
            return
1880
            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
1881
        }
1882
        return packFloat64( aSign, 0, 0 );
1883
    }
1884
    lastBitMask = 1;
1885
    lastBitMask <<= 0x433 - aExp;
1886
    roundBitsMask = lastBitMask - 1;
1887
    z = a;
1888
    roundingMode = float_rounding_mode;
1889
    if ( roundingMode == float_round_nearest_even ) {
1890
        z += lastBitMask>>1;
1891
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1892
    }
1893
    else if ( roundingMode != float_round_to_zero ) {
1894
        if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1895
            z += roundBitsMask;
1896
        }
1897
    }
1898
    z &= ~ roundBitsMask;
1899
    if ( z != a ) float_exception_flags |= float_flag_inexact;
1900
    return z;
1901

    
1902
}
1903

    
1904
/*
1905
-------------------------------------------------------------------------------
1906
Returns the result of adding the absolute values of the double-precision
1907
floating-point values `a' and `b'.  If `zSign' is true, the sum is negated
1908
before being returned.  `zSign' is ignored if the result is a NaN.  The
1909
addition is performed according to the IEC/IEEE Standard for Binary
1910
Floating-point Arithmetic.
1911
-------------------------------------------------------------------------------
1912
*/
1913
static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
1914
{
1915
    int16 aExp, bExp, zExp;
1916
    bits64 aSig, bSig, zSig;
1917
    int16 expDiff;
1918

    
1919
    aSig = extractFloat64Frac( a );
1920
    aExp = extractFloat64Exp( a );
1921
    bSig = extractFloat64Frac( b );
1922
    bExp = extractFloat64Exp( b );
1923
    expDiff = aExp - bExp;
1924
    aSig <<= 9;
1925
    bSig <<= 9;
1926
    if ( 0 < expDiff ) {
1927
        if ( aExp == 0x7FF ) {
1928
            if ( aSig ) return propagateFloat64NaN( a, b );
1929
            return a;
1930
        }
1931
        if ( bExp == 0 ) {
1932
            --expDiff;
1933
        }
1934
        else {
1935
            bSig |= LIT64( 0x2000000000000000 );
1936
        }
1937
        shift64RightJamming( bSig, expDiff, &bSig );
1938
        zExp = aExp;
1939
    }
1940
    else if ( expDiff < 0 ) {
1941
        if ( bExp == 0x7FF ) {
1942
            if ( bSig ) return propagateFloat64NaN( a, b );
1943
            return packFloat64( zSign, 0x7FF, 0 );
1944
        }
1945
        if ( aExp == 0 ) {
1946
            ++expDiff;
1947
        }
1948
        else {
1949
            aSig |= LIT64( 0x2000000000000000 );
1950
        }
1951
        shift64RightJamming( aSig, - expDiff, &aSig );
1952
        zExp = bExp;
1953
    }
1954
    else {
1955
        if ( aExp == 0x7FF ) {
1956
            if ( aSig | bSig ) return propagateFloat64NaN( a, b );
1957
            return a;
1958
        }
1959
        if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
1960
        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
1961
        zExp = aExp;
1962
        goto roundAndPack;
1963
    }
1964
    aSig |= LIT64( 0x2000000000000000 );
1965
    zSig = ( aSig + bSig )<<1;
1966
    --zExp;
1967
    if ( (sbits64) zSig < 0 ) {
1968
        zSig = aSig + bSig;
1969
        ++zExp;
1970
    }
1971
 roundAndPack:
1972
    return roundAndPackFloat64( zSign, zExp, zSig );
1973

    
1974
}
1975

    
1976
/*
1977
-------------------------------------------------------------------------------
1978
Returns the result of subtracting the absolute values of the double-
1979
precision floating-point values `a' and `b'.  If `zSign' is true, the
1980
difference is negated before being returned.  `zSign' is ignored if the
1981
result is a NaN.  The subtraction is performed according to the IEC/IEEE
1982
Standard for Binary Floating-point Arithmetic.
1983
-------------------------------------------------------------------------------
1984
*/
1985
static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
1986
{
1987
    int16 aExp, bExp, zExp;
1988
    bits64 aSig, bSig, zSig;
1989
    int16 expDiff;
1990

    
1991
    aSig = extractFloat64Frac( a );
1992
    aExp = extractFloat64Exp( a );
1993
    bSig = extractFloat64Frac( b );
1994
    bExp = extractFloat64Exp( b );
1995
    expDiff = aExp - bExp;
1996
    aSig <<= 10;
1997
    bSig <<= 10;
1998
    if ( 0 < expDiff ) goto aExpBigger;
1999
    if ( expDiff < 0 ) goto bExpBigger;
2000
    if ( aExp == 0x7FF ) {
2001
        if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2002
        float_raise( float_flag_invalid );
2003
        return float64_default_nan;
2004
    }
2005
    if ( aExp == 0 ) {
2006
        aExp = 1;
2007
        bExp = 1;
2008
    }
2009
    if ( bSig < aSig ) goto aBigger;
2010
    if ( aSig < bSig ) goto bBigger;
2011
    return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2012
 bExpBigger:
2013
    if ( bExp == 0x7FF ) {
2014
        if ( bSig ) return propagateFloat64NaN( a, b );
2015
        return packFloat64( zSign ^ 1, 0x7FF, 0 );
2016
    }
2017
    if ( aExp == 0 ) {
2018
        ++expDiff;
2019
    }
2020
    else {
2021
        aSig |= LIT64( 0x4000000000000000 );
2022
    }
2023
    shift64RightJamming( aSig, - expDiff, &aSig );
2024
    bSig |= LIT64( 0x4000000000000000 );
2025
 bBigger:
2026
    zSig = bSig - aSig;
2027
    zExp = bExp;
2028
    zSign ^= 1;
2029
    goto normalizeRoundAndPack;
2030
 aExpBigger:
2031
    if ( aExp == 0x7FF ) {
2032
        if ( aSig ) return propagateFloat64NaN( a, b );
2033
        return a;
2034
    }
2035
    if ( bExp == 0 ) {
2036
        --expDiff;
2037
    }
2038
    else {
2039
        bSig |= LIT64( 0x4000000000000000 );
2040
    }
2041
    shift64RightJamming( bSig, expDiff, &bSig );
2042
    aSig |= LIT64( 0x4000000000000000 );
2043
 aBigger:
2044
    zSig = aSig - bSig;
2045
    zExp = aExp;
2046
 normalizeRoundAndPack:
2047
    --zExp;
2048
    return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2049

    
2050
}
2051

    
2052
/*
2053
-------------------------------------------------------------------------------
2054
Returns the result of adding the double-precision floating-point values `a'
2055
and `b'.  The operation is performed according to the IEC/IEEE Standard for
2056
Binary Floating-point Arithmetic.
2057
-------------------------------------------------------------------------------
2058
*/
2059
float64 float64_add( float64 a, float64 b )
2060
{
2061
    flag aSign, bSign;
2062

    
2063
    aSign = extractFloat64Sign( a );
2064
    bSign = extractFloat64Sign( b );
2065
    if ( aSign == bSign ) {
2066
        return addFloat64Sigs( a, b, aSign );
2067
    }
2068
    else {
2069
        return subFloat64Sigs( a, b, aSign );
2070
    }
2071

    
2072
}
2073

    
2074
/*
2075
-------------------------------------------------------------------------------
2076
Returns the result of subtracting the double-precision floating-point values
2077
`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2078
for Binary Floating-point Arithmetic.
2079
-------------------------------------------------------------------------------
2080
*/
2081
float64 float64_sub( float64 a, float64 b )
2082
{
2083
    flag aSign, bSign;
2084

    
2085
    aSign = extractFloat64Sign( a );
2086
    bSign = extractFloat64Sign( b );
2087
    if ( aSign == bSign ) {
2088
        return subFloat64Sigs( a, b, aSign );
2089
    }
2090
    else {
2091
        return addFloat64Sigs( a, b, aSign );
2092
    }
2093

    
2094
}
2095

    
2096
/*
2097
-------------------------------------------------------------------------------
2098
Returns the result of multiplying the double-precision floating-point values
2099
`a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2100
for Binary Floating-point Arithmetic.
2101
-------------------------------------------------------------------------------
2102
*/
2103
float64 float64_mul( float64 a, float64 b )
2104
{
2105
    flag aSign, bSign, zSign;
2106
    int16 aExp, bExp, zExp;
2107
    bits64 aSig, bSig, zSig0, zSig1;
2108

    
2109
    aSig = extractFloat64Frac( a );
2110
    aExp = extractFloat64Exp( a );
2111
    aSign = extractFloat64Sign( a );
2112
    bSig = extractFloat64Frac( b );
2113
    bExp = extractFloat64Exp( b );
2114
    bSign = extractFloat64Sign( b );
2115
    zSign = aSign ^ bSign;
2116
    if ( aExp == 0x7FF ) {
2117
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2118
            return propagateFloat64NaN( a, b );
2119
        }
2120
        if ( ( bExp | bSig ) == 0 ) {
2121
            float_raise( float_flag_invalid );
2122
            return float64_default_nan;
2123
        }
2124
        return packFloat64( zSign, 0x7FF, 0 );
2125
    }
2126
    if ( bExp == 0x7FF ) {
2127
        if ( bSig ) return propagateFloat64NaN( a, b );
2128
        if ( ( aExp | aSig ) == 0 ) {
2129
            float_raise( float_flag_invalid );
2130
            return float64_default_nan;
2131
        }
2132
        return packFloat64( zSign, 0x7FF, 0 );
2133
    }
2134
    if ( aExp == 0 ) {
2135
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2136
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2137
    }
2138
    if ( bExp == 0 ) {
2139
        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2140
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2141
    }
2142
    zExp = aExp + bExp - 0x3FF;
2143
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2144
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2145
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
2146
    zSig0 |= ( zSig1 != 0 );
2147
    if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2148
        zSig0 <<= 1;
2149
        --zExp;
2150
    }
2151
    return roundAndPackFloat64( zSign, zExp, zSig0 );
2152

    
2153
}
2154

    
2155
/*
2156
-------------------------------------------------------------------------------
2157
Returns the result of dividing the double-precision floating-point value `a'
2158
by the corresponding value `b'.  The operation is performed according to
2159
the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2160
-------------------------------------------------------------------------------
2161
*/
2162
float64 float64_div( float64 a, float64 b )
2163
{
2164
    flag aSign, bSign, zSign;
2165
    int16 aExp, bExp, zExp;
2166
    bits64 aSig, bSig, zSig;
2167
    bits64 rem0, rem1;
2168
    bits64 term0, term1;
2169

    
2170
    aSig = extractFloat64Frac( a );
2171
    aExp = extractFloat64Exp( a );
2172
    aSign = extractFloat64Sign( a );
2173
    bSig = extractFloat64Frac( b );
2174
    bExp = extractFloat64Exp( b );
2175
    bSign = extractFloat64Sign( b );
2176
    zSign = aSign ^ bSign;
2177
    if ( aExp == 0x7FF ) {
2178
        if ( aSig ) return propagateFloat64NaN( a, b );
2179
        if ( bExp == 0x7FF ) {
2180
            if ( bSig ) return propagateFloat64NaN( a, b );
2181
            float_raise( float_flag_invalid );
2182
            return float64_default_nan;
2183
        }
2184
        return packFloat64( zSign, 0x7FF, 0 );
2185
    }
2186
    if ( bExp == 0x7FF ) {
2187
        if ( bSig ) return propagateFloat64NaN( a, b );
2188
        return packFloat64( zSign, 0, 0 );
2189
    }
2190
    if ( bExp == 0 ) {
2191
        if ( bSig == 0 ) {
2192
            if ( ( aExp | aSig ) == 0 ) {
2193
                float_raise( float_flag_invalid );
2194
                return float64_default_nan;
2195
            }
2196
            float_raise( float_flag_divbyzero );
2197
            return packFloat64( zSign, 0x7FF, 0 );
2198
        }
2199
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2200
    }
2201
    if ( aExp == 0 ) {
2202
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2203
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2204
    }
2205
    zExp = aExp - bExp + 0x3FD;
2206
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2207
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2208
    if ( bSig <= ( aSig + aSig ) ) {
2209
        aSig >>= 1;
2210
        ++zExp;
2211
    }
2212
    zSig = estimateDiv128To64( aSig, 0, bSig );
2213
    if ( ( zSig & 0x1FF ) <= 2 ) {
2214
        mul64To128( bSig, zSig, &term0, &term1 );
2215
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2216
        while ( (sbits64) rem0 < 0 ) {
2217
            --zSig;
2218
            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2219
        }
2220
        zSig |= ( rem1 != 0 );
2221
    }
2222
    return roundAndPackFloat64( zSign, zExp, zSig );
2223

    
2224
}
2225

    
2226
/*
2227
-------------------------------------------------------------------------------
2228
Returns the remainder of the double-precision floating-point value `a'
2229
with respect to the corresponding value `b'.  The operation is performed
2230
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2231
-------------------------------------------------------------------------------
2232
*/
2233
float64 float64_rem( float64 a, float64 b )
2234
{
2235
    flag aSign, bSign, zSign;
2236
    int16 aExp, bExp, expDiff;
2237
    bits64 aSig, bSig;
2238
    bits64 q, alternateASig;
2239
    sbits64 sigMean;
2240

    
2241
    aSig = extractFloat64Frac( a );
2242
    aExp = extractFloat64Exp( a );
2243
    aSign = extractFloat64Sign( a );
2244
    bSig = extractFloat64Frac( b );
2245
    bExp = extractFloat64Exp( b );
2246
    bSign = extractFloat64Sign( b );
2247
    if ( aExp == 0x7FF ) {
2248
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2249
            return propagateFloat64NaN( a, b );
2250
        }
2251
        float_raise( float_flag_invalid );
2252
        return float64_default_nan;
2253
    }
2254
    if ( bExp == 0x7FF ) {
2255
        if ( bSig ) return propagateFloat64NaN( a, b );
2256
        return a;
2257
    }
2258
    if ( bExp == 0 ) {
2259
        if ( bSig == 0 ) {
2260
            float_raise( float_flag_invalid );
2261
            return float64_default_nan;
2262
        }
2263
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2264
    }
2265
    if ( aExp == 0 ) {
2266
        if ( aSig == 0 ) return a;
2267
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2268
    }
2269
    expDiff = aExp - bExp;
2270
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2271
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2272
    if ( expDiff < 0 ) {
2273
        if ( expDiff < -1 ) return a;
2274
        aSig >>= 1;
2275
    }
2276
    q = ( bSig <= aSig );
2277
    if ( q ) aSig -= bSig;
2278
    expDiff -= 64;
2279
    while ( 0 < expDiff ) {
2280
        q = estimateDiv128To64( aSig, 0, bSig );
2281
        q = ( 2 < q ) ? q - 2 : 0;
2282
        aSig = - ( ( bSig>>2 ) * q );
2283
        expDiff -= 62;
2284
    }
2285
    expDiff += 64;
2286
    if ( 0 < expDiff ) {
2287
        q = estimateDiv128To64( aSig, 0, bSig );
2288
        q = ( 2 < q ) ? q - 2 : 0;
2289
        q >>= 64 - expDiff;
2290
        bSig >>= 2;
2291
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2292
    }
2293
    else {
2294
        aSig >>= 2;
2295
        bSig >>= 2;
2296
    }
2297
    do {
2298
        alternateASig = aSig;
2299
        ++q;
2300
        aSig -= bSig;
2301
    } while ( 0 <= (sbits64) aSig );
2302
    sigMean = aSig + alternateASig;
2303
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2304
        aSig = alternateASig;
2305
    }
2306
    zSign = ( (sbits64) aSig < 0 );
2307
    if ( zSign ) aSig = - aSig;
2308
    return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
2309

    
2310
}
2311

    
2312
/*
2313
-------------------------------------------------------------------------------
2314
Returns the square root of the double-precision floating-point value `a'.
2315
The operation is performed according to the IEC/IEEE Standard for Binary
2316
Floating-point Arithmetic.
2317
-------------------------------------------------------------------------------
2318
*/
2319
float64 float64_sqrt( float64 a )
2320
{
2321
    flag aSign;
2322
    int16 aExp, zExp;
2323
    bits64 aSig, zSig;
2324
    bits64 rem0, rem1, term0, term1; //, shiftedRem;
2325
    //float64 z;
2326

    
2327
    aSig = extractFloat64Frac( a );
2328
    aExp = extractFloat64Exp( a );
2329
    aSign = extractFloat64Sign( a );
2330
    if ( aExp == 0x7FF ) {
2331
        if ( aSig ) return propagateFloat64NaN( a, a );
2332
        if ( ! aSign ) return a;
2333
        float_raise( float_flag_invalid );
2334
        return float64_default_nan;
2335
    }
2336
    if ( aSign ) {
2337
        if ( ( aExp | aSig ) == 0 ) return a;
2338
        float_raise( float_flag_invalid );
2339
        return float64_default_nan;
2340
    }
2341
    if ( aExp == 0 ) {
2342
        if ( aSig == 0 ) return 0;
2343
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2344
    }
2345
    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2346
    aSig |= LIT64( 0x0010000000000000 );
2347
    zSig = estimateSqrt32( aExp, aSig>>21 );
2348
    zSig <<= 31;
2349
    aSig <<= 9 - ( aExp & 1 );
2350
    zSig = estimateDiv128To64( aSig, 0, zSig ) + zSig + 2;
2351
    if ( ( zSig & 0x3FF ) <= 5 ) {
2352
        if ( zSig < 2 ) {
2353
            zSig = LIT64( 0xFFFFFFFFFFFFFFFF );
2354
        }
2355
        else {
2356
            aSig <<= 2;
2357
            mul64To128( zSig, zSig, &term0, &term1 );
2358
            sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2359
            while ( (sbits64) rem0 < 0 ) {
2360
                --zSig;
2361
                shortShift128Left( 0, zSig, 1, &term0, &term1 );
2362
                term1 |= 1;
2363
                add128( rem0, rem1, term0, term1, &rem0, &rem1 );
2364
            }
2365
            zSig |= ( ( rem0 | rem1 ) != 0 );
2366
        }
2367
    }
2368
    shift64RightJamming( zSig, 1, &zSig );
2369
    return roundAndPackFloat64( 0, zExp, zSig );
2370

    
2371
}
2372

    
2373
/*
2374
-------------------------------------------------------------------------------
2375
Returns 1 if the double-precision floating-point value `a' is equal to the
2376
corresponding value `b', and 0 otherwise.  The comparison is performed
2377
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2378
-------------------------------------------------------------------------------
2379
*/
2380
flag float64_eq( float64 a, float64 b )
2381
{
2382

    
2383
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2384
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2385
       ) {
2386
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2387
            float_raise( float_flag_invalid );
2388
        }
2389
        return 0;
2390
    }
2391
    return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2392

    
2393
}
2394

    
2395
/*
2396
-------------------------------------------------------------------------------
2397
Returns 1 if the double-precision floating-point value `a' is less than or
2398
equal to the corresponding value `b', and 0 otherwise.  The comparison is
2399
performed according to the IEC/IEEE Standard for Binary Floating-point
2400
Arithmetic.
2401
-------------------------------------------------------------------------------
2402
*/
2403
flag float64_le( float64 a, float64 b )
2404
{
2405
    flag aSign, bSign;
2406

    
2407
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2408
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2409
       ) {
2410
        float_raise( float_flag_invalid );
2411
        return 0;
2412
    }
2413
    aSign = extractFloat64Sign( a );
2414
    bSign = extractFloat64Sign( b );
2415
    if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2416
    return ( a == b ) || ( aSign ^ ( a < b ) );
2417

    
2418
}
2419

    
2420
/*
2421
-------------------------------------------------------------------------------
2422
Returns 1 if the double-precision floating-point value `a' is less than
2423
the corresponding value `b', and 0 otherwise.  The comparison is performed
2424
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2425
-------------------------------------------------------------------------------
2426
*/
2427
flag float64_lt( float64 a, float64 b )
2428
{
2429
    flag aSign, bSign;
2430

    
2431
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2432
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2433
       ) {
2434
        float_raise( float_flag_invalid );
2435
        return 0;
2436
    }
2437
    aSign = extractFloat64Sign( a );
2438
    bSign = extractFloat64Sign( b );
2439
    if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2440
    return ( a != b ) && ( aSign ^ ( a < b ) );
2441

    
2442
}
2443

    
2444
/*
2445
-------------------------------------------------------------------------------
2446
Returns 1 if the double-precision floating-point value `a' is equal to the
2447
corresponding value `b', and 0 otherwise.  The invalid exception is raised
2448
if either operand is a NaN.  Otherwise, the comparison is performed
2449
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2450
-------------------------------------------------------------------------------
2451
*/
2452
flag float64_eq_signaling( float64 a, float64 b )
2453
{
2454

    
2455
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2456
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2457
       ) {
2458
        float_raise( float_flag_invalid );
2459
        return 0;
2460
    }
2461
    return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2462

    
2463
}
2464

    
2465
/*
2466
-------------------------------------------------------------------------------
2467
Returns 1 if the double-precision floating-point value `a' is less than or
2468
equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2469
cause an exception.  Otherwise, the comparison is performed according to the
2470
IEC/IEEE Standard for Binary Floating-point Arithmetic.
2471
-------------------------------------------------------------------------------
2472
*/
2473
flag float64_le_quiet( float64 a, float64 b )
2474
{
2475
    flag aSign, bSign;
2476
    //int16 aExp, bExp;
2477

    
2478
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2479
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2480
       ) {
2481
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2482
            float_raise( float_flag_invalid );
2483
        }
2484
        return 0;
2485
    }
2486
    aSign = extractFloat64Sign( a );
2487
    bSign = extractFloat64Sign( b );
2488
    if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2489
    return ( a == b ) || ( aSign ^ ( a < b ) );
2490

    
2491
}
2492

    
2493
/*
2494
-------------------------------------------------------------------------------
2495
Returns 1 if the double-precision floating-point value `a' is less than
2496
the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2497
exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2498
Standard for Binary Floating-point Arithmetic.
2499
-------------------------------------------------------------------------------
2500
*/
2501
flag float64_lt_quiet( float64 a, float64 b )
2502
{
2503
    flag aSign, bSign;
2504

    
2505
    if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2506
         || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2507
       ) {
2508
        if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2509
            float_raise( float_flag_invalid );
2510
        }
2511
        return 0;
2512
    }
2513
    aSign = extractFloat64Sign( a );
2514
    bSign = extractFloat64Sign( b );
2515
    if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2516
    return ( a != b ) && ( aSign ^ ( a < b ) );
2517

    
2518
}
2519

    
2520
#ifdef FLOATX80
2521

    
2522
/*
2523
-------------------------------------------------------------------------------
2524
Returns the result of converting the extended double-precision floating-
2525
point value `a' to the 32-bit two's complement integer format.  The
2526
conversion is performed according to the IEC/IEEE Standard for Binary
2527
Floating-point Arithmetic---which means in particular that the conversion
2528
is rounded according to the current rounding mode.  If `a' is a NaN, the
2529
largest positive integer is returned.  Otherwise, if the conversion
2530
overflows, the largest integer with the same sign as `a' is returned.
2531
-------------------------------------------------------------------------------
2532
*/
2533
int32 floatx80_to_int32( floatx80 a )
2534
{
2535
    flag aSign;
2536
    int32 aExp, shiftCount;
2537
    bits64 aSig;
2538

    
2539
    aSig = extractFloatx80Frac( a );
2540
    aExp = extractFloatx80Exp( a );
2541
    aSign = extractFloatx80Sign( a );
2542
    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2543
    shiftCount = 0x4037 - aExp;
2544
    if ( shiftCount <= 0 ) shiftCount = 1;
2545
    shift64RightJamming( aSig, shiftCount, &aSig );
2546
    return roundAndPackInt32( aSign, aSig );
2547

    
2548
}
2549

    
2550
/*
2551
-------------------------------------------------------------------------------
2552
Returns the result of converting the extended double-precision floating-
2553
point value `a' to the 32-bit two's complement integer format.  The
2554
conversion is performed according to the IEC/IEEE Standard for Binary
2555
Floating-point Arithmetic, except that the conversion is always rounded
2556
toward zero.  If `a' is a NaN, the largest positive integer is returned.
2557
Otherwise, if the conversion overflows, the largest integer with the same
2558
sign as `a' is returned.
2559
-------------------------------------------------------------------------------
2560
*/
2561
int32 floatx80_to_int32_round_to_zero( floatx80 a )
2562
{
2563
    flag aSign;
2564
    int32 aExp, shiftCount;
2565
    bits64 aSig, savedASig;
2566
    int32 z;
2567

    
2568
    aSig = extractFloatx80Frac( a );
2569
    aExp = extractFloatx80Exp( a );
2570
    aSign = extractFloatx80Sign( a );
2571
    shiftCount = 0x403E - aExp;
2572
    if ( shiftCount < 32 ) {
2573
        if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2574
        goto invalid;
2575
    }
2576
    else if ( 63 < shiftCount ) {
2577
        if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2578
        return 0;
2579
    }
2580
    savedASig = aSig;
2581
    aSig >>= shiftCount;
2582
    z = aSig;
2583
    if ( aSign ) z = - z;
2584
    if ( ( z < 0 ) ^ aSign ) {
2585
 invalid:
2586
        float_exception_flags |= float_flag_invalid;
2587
        return aSign ? 0x80000000 : 0x7FFFFFFF;
2588
    }
2589
    if ( ( aSig<<shiftCount ) != savedASig ) {
2590
        float_exception_flags |= float_flag_inexact;
2591
    }
2592
    return z;
2593

    
2594
}
2595

    
2596
/*
2597
-------------------------------------------------------------------------------
2598
Returns the result of converting the extended double-precision floating-
2599
point value `a' to the single-precision floating-point format.  The
2600
conversion is performed according to the IEC/IEEE Standard for Binary
2601
Floating-point Arithmetic.
2602
-------------------------------------------------------------------------------
2603
*/
2604
float32 floatx80_to_float32( floatx80 a )
2605
{
2606
    flag aSign;
2607
    int32 aExp;
2608
    bits64 aSig;
2609

    
2610
    aSig = extractFloatx80Frac( a );
2611
    aExp = extractFloatx80Exp( a );
2612
    aSign = extractFloatx80Sign( a );
2613
    if ( aExp == 0x7FFF ) {
2614
        if ( (bits64) ( aSig<<1 ) ) {
2615
            return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
2616
        }
2617
        return packFloat32( aSign, 0xFF, 0 );
2618
    }
2619
    shift64RightJamming( aSig, 33, &aSig );
2620
    if ( aExp || aSig ) aExp -= 0x3F81;
2621
    return roundAndPackFloat32( aSign, aExp, aSig );
2622

    
2623
}
2624

    
2625
/*
2626
-------------------------------------------------------------------------------
2627
Returns the result of converting the extended double-precision floating-
2628
point value `a' to the double-precision floating-point format.  The
2629
conversion is performed according to the IEC/IEEE Standard for Binary
2630
Floating-point Arithmetic.
2631
-------------------------------------------------------------------------------
2632
*/
2633
float64 floatx80_to_float64( floatx80 a )
2634
{
2635
    flag aSign;
2636
    int32 aExp;
2637
    bits64 aSig, zSig;
2638

    
2639
    aSig = extractFloatx80Frac( a );
2640
    aExp = extractFloatx80Exp( a );
2641
    aSign = extractFloatx80Sign( a );
2642
    if ( aExp == 0x7FFF ) {
2643
        if ( (bits64) ( aSig<<1 ) ) {
2644
            return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
2645
        }
2646
        return packFloat64( aSign, 0x7FF, 0 );
2647
    }
2648
    shift64RightJamming( aSig, 1, &zSig );
2649
    if ( aExp || aSig ) aExp -= 0x3C01;
2650
    return roundAndPackFloat64( aSign, aExp, zSig );
2651

    
2652
}
2653

    
2654
/*
2655
-------------------------------------------------------------------------------
2656
Rounds the extended double-precision floating-point value `a' to an integer,
2657
and returns the result as an extended quadruple-precision floating-point
2658
value.  The operation is performed according to the IEC/IEEE Standard for
2659
Binary Floating-point Arithmetic.
2660
-------------------------------------------------------------------------------
2661
*/
2662
floatx80 floatx80_round_to_int( floatx80 a )
2663
{
2664
    flag aSign;
2665
    int32 aExp;
2666
    bits64 lastBitMask, roundBitsMask;
2667
    int8 roundingMode;
2668
    floatx80 z;
2669

    
2670
    aExp = extractFloatx80Exp( a );
2671
    if ( 0x403E <= aExp ) {
2672
        if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
2673
            return propagateFloatx80NaN( a, a );
2674
        }
2675
        return a;
2676
    }
2677
    if ( aExp <= 0x3FFE ) {
2678
        if (    ( aExp == 0 )
2679
             && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
2680
            return a;
2681
        }
2682
        float_exception_flags |= float_flag_inexact;
2683
        aSign = extractFloatx80Sign( a );
2684
        switch ( float_rounding_mode ) {
2685
         case float_round_nearest_even:
2686
            if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
2687
               ) {
2688
                return
2689
                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
2690
            }
2691
            break;
2692
         case float_round_down:
2693
            return
2694
                  aSign ?
2695
                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
2696
                : packFloatx80( 0, 0, 0 );
2697
         case float_round_up:
2698
            return
2699
                  aSign ? packFloatx80( 1, 0, 0 )
2700
                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
2701
        }
2702
        return packFloatx80( aSign, 0, 0 );
2703
    }
2704
    lastBitMask = 1;
2705
    lastBitMask <<= 0x403E - aExp;
2706
    roundBitsMask = lastBitMask - 1;
2707
    z = a;
2708
    roundingMode = float_rounding_mode;
2709
    if ( roundingMode == float_round_nearest_even ) {
2710
        z.low += lastBitMask>>1;
2711
        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
2712
    }
2713
    else if ( roundingMode != float_round_to_zero ) {
2714
        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2715
            z.low += roundBitsMask;
2716
        }
2717
    }
2718
    z.low &= ~ roundBitsMask;
2719
    if ( z.low == 0 ) {
2720
        ++z.high;
2721
        z.low = LIT64( 0x8000000000000000 );
2722
    }
2723
    if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
2724
    return z;
2725

    
2726
}
2727

    
2728
/*
2729
-------------------------------------------------------------------------------
2730
Returns the result of adding the absolute values of the extended double-
2731
precision floating-point values `a' and `b'.  If `zSign' is true, the sum is
2732
negated before being returned.  `zSign' is ignored if the result is a NaN.
2733
The addition is performed according to the IEC/IEEE Standard for Binary
2734
Floating-point Arithmetic.
2735
-------------------------------------------------------------------------------
2736
*/
2737
static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
2738
{
2739
    int32 aExp, bExp, zExp;
2740
    bits64 aSig, bSig, zSig0, zSig1;
2741
    int32 expDiff;
2742

    
2743
    aSig = extractFloatx80Frac( a );
2744
    aExp = extractFloatx80Exp( a );
2745
    bSig = extractFloatx80Frac( b );
2746
    bExp = extractFloatx80Exp( b );
2747
    expDiff = aExp - bExp;
2748
    if ( 0 < expDiff ) {
2749
        if ( aExp == 0x7FFF ) {
2750
            if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2751
            return a;
2752
        }
2753
        if ( bExp == 0 ) --expDiff;
2754
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2755
        zExp = aExp;
2756
    }
2757
    else if ( expDiff < 0 ) {
2758
        if ( bExp == 0x7FFF ) {
2759
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2760
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2761
        }
2762
        if ( aExp == 0 ) ++expDiff;
2763
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2764
        zExp = bExp;
2765
    }
2766
    else {
2767
        if ( aExp == 0x7FFF ) {
2768
            if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2769
                return propagateFloatx80NaN( a, b );
2770
            }
2771
            return a;
2772
        }
2773
        zSig1 = 0;
2774
        zSig0 = aSig + bSig;
2775
        if ( aExp == 0 ) {
2776
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
2777
            goto roundAndPack;
2778
        }
2779
        zExp = aExp;
2780
        goto shiftRight1;
2781
    }
2782
    
2783
    zSig0 = aSig + bSig;
2784

    
2785
    if ( (sbits64) zSig0 < 0 ) goto roundAndPack; 
2786
 shiftRight1:
2787
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
2788
    zSig0 |= LIT64( 0x8000000000000000 );
2789
    ++zExp;
2790
 roundAndPack:
2791
    return
2792
        roundAndPackFloatx80(
2793
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
2794

    
2795
}
2796

    
2797
/*
2798
-------------------------------------------------------------------------------
2799
Returns the result of subtracting the absolute values of the extended
2800
double-precision floating-point values `a' and `b'.  If `zSign' is true,
2801
the difference is negated before being returned.  `zSign' is ignored if the
2802
result is a NaN.  The subtraction is performed according to the IEC/IEEE
2803
Standard for Binary Floating-point Arithmetic.
2804
-------------------------------------------------------------------------------
2805
*/
2806
static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
2807
{
2808
    int32 aExp, bExp, zExp;
2809
    bits64 aSig, bSig, zSig0, zSig1;
2810
    int32 expDiff;
2811
    floatx80 z;
2812

    
2813
    aSig = extractFloatx80Frac( a );
2814
    aExp = extractFloatx80Exp( a );
2815
    bSig = extractFloatx80Frac( b );
2816
    bExp = extractFloatx80Exp( b );
2817
    expDiff = aExp - bExp;
2818
    if ( 0 < expDiff ) goto aExpBigger;
2819
    if ( expDiff < 0 ) goto bExpBigger;
2820
    if ( aExp == 0x7FFF ) {
2821
        if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2822
            return propagateFloatx80NaN( a, b );
2823
        }
2824
        float_raise( float_flag_invalid );
2825
        z.low = floatx80_default_nan_low;
2826
        z.high = floatx80_default_nan_high;
2827
        return z;
2828
    }
2829
    if ( aExp == 0 ) {
2830
        aExp = 1;
2831
        bExp = 1;
2832
    }
2833
    zSig1 = 0;
2834
    if ( bSig < aSig ) goto aBigger;
2835
    if ( aSig < bSig ) goto bBigger;
2836
    return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
2837
 bExpBigger:
2838
    if ( bExp == 0x7FFF ) {
2839
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2840
        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
2841
    }
2842
    if ( aExp == 0 ) ++expDiff;
2843
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2844
 bBigger:
2845
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
2846
    zExp = bExp;
2847
    zSign ^= 1;
2848
    goto normalizeRoundAndPack;
2849
 aExpBigger:
2850
    if ( aExp == 0x7FFF ) {
2851
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2852
        return a;
2853
    }
2854
    if ( bExp == 0 ) --expDiff;
2855
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2856
 aBigger:
2857
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
2858
    zExp = aExp;
2859
 normalizeRoundAndPack:
2860
    return
2861
        normalizeRoundAndPackFloatx80(
2862
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
2863

    
2864
}
2865

    
2866
/*
2867
-------------------------------------------------------------------------------
2868
Returns the result of adding the extended double-precision floating-point
2869
values `a' and `b'.  The operation is performed according to the IEC/IEEE
2870
Standard for Binary Floating-point Arithmetic.
2871
-------------------------------------------------------------------------------
2872
*/
2873
floatx80 floatx80_add( floatx80 a, floatx80 b )
2874
{
2875
    flag aSign, bSign;
2876
    
2877
    aSign = extractFloatx80Sign( a );
2878
    bSign = extractFloatx80Sign( b );
2879
    if ( aSign == bSign ) {
2880
        return addFloatx80Sigs( a, b, aSign );
2881
    }
2882
    else {
2883
        return subFloatx80Sigs( a, b, aSign );
2884
    }
2885
    
2886
}
2887

    
2888
/*
2889
-------------------------------------------------------------------------------
2890
Returns the result of subtracting the extended double-precision floating-
2891
point values `a' and `b'.  The operation is performed according to the
2892
IEC/IEEE Standard for Binary Floating-point Arithmetic.
2893
-------------------------------------------------------------------------------
2894
*/
2895
floatx80 floatx80_sub( floatx80 a, floatx80 b )
2896
{
2897
    flag aSign, bSign;
2898

    
2899
    aSign = extractFloatx80Sign( a );
2900
    bSign = extractFloatx80Sign( b );
2901
    if ( aSign == bSign ) {
2902
        return subFloatx80Sigs( a, b, aSign );
2903
    }
2904
    else {
2905
        return addFloatx80Sigs( a, b, aSign );
2906
    }
2907

    
2908
}
2909

    
2910
/*
2911
-------------------------------------------------------------------------------
2912
Returns the result of multiplying the extended double-precision floating-
2913
point values `a' and `b'.  The operation is performed according to the
2914
IEC/IEEE Standard for Binary Floating-point Arithmetic.
2915
-------------------------------------------------------------------------------
2916
*/
2917
floatx80 floatx80_mul( floatx80 a, floatx80 b )
2918
{
2919
    flag aSign, bSign, zSign;
2920
    int32 aExp, bExp, zExp;
2921
    bits64 aSig, bSig, zSig0, zSig1;
2922
    floatx80 z;
2923

    
2924
    aSig = extractFloatx80Frac( a );
2925
    aExp = extractFloatx80Exp( a );
2926
    aSign = extractFloatx80Sign( a );
2927
    bSig = extractFloatx80Frac( b );
2928
    bExp = extractFloatx80Exp( b );
2929
    bSign = extractFloatx80Sign( b );
2930
    zSign = aSign ^ bSign;
2931
    if ( aExp == 0x7FFF ) {
2932
        if (    (bits64) ( aSig<<1 )
2933
             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
2934
            return propagateFloatx80NaN( a, b );
2935
        }
2936
        if ( ( bExp | bSig ) == 0 ) goto invalid;
2937
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2938
    }
2939
    if ( bExp == 0x7FFF ) {
2940
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2941
        if ( ( aExp | aSig ) == 0 ) {
2942
 invalid:
2943
            float_raise( float_flag_invalid );
2944
            z.low = floatx80_default_nan_low;
2945
            z.high = floatx80_default_nan_high;
2946
            return z;
2947
        }
2948
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2949
    }
2950
    if ( aExp == 0 ) {
2951
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
2952
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
2953
    }
2954
    if ( bExp == 0 ) {
2955
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
2956
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
2957
    }
2958
    zExp = aExp + bExp - 0x3FFE;
2959
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
2960
    if ( 0 < (sbits64) zSig0 ) {
2961
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
2962
        --zExp;
2963
    }
2964
    return
2965
        roundAndPackFloatx80(
2966
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
2967

    
2968
}
2969

    
2970
/*
2971
-------------------------------------------------------------------------------
2972
Returns the result of dividing the extended double-precision floating-point
2973
value `a' by the corresponding value `b'.  The operation is performed
2974
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2975
-------------------------------------------------------------------------------
2976
*/
2977
floatx80 floatx80_div( floatx80 a, floatx80 b )
2978
{
2979
    flag aSign, bSign, zSign;
2980
    int32 aExp, bExp, zExp;
2981
    bits64 aSig, bSig, zSig0, zSig1;
2982
    bits64 rem0, rem1, rem2, term0, term1, term2;
2983
    floatx80 z;
2984

    
2985
    aSig = extractFloatx80Frac( a );
2986
    aExp = extractFloatx80Exp( a );
2987
    aSign = extractFloatx80Sign( a );
2988
    bSig = extractFloatx80Frac( b );
2989
    bExp = extractFloatx80Exp( b );
2990
    bSign = extractFloatx80Sign( b );
2991
    zSign = aSign ^ bSign;
2992
    if ( aExp == 0x7FFF ) {
2993
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2994
        if ( bExp == 0x7FFF ) {
2995
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2996
            goto invalid;
2997
        }
2998
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2999
    }
3000
    if ( bExp == 0x7FFF ) {
3001
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3002
        return packFloatx80( zSign, 0, 0 );
3003
    }
3004
    if ( bExp == 0 ) {
3005
        if ( bSig == 0 ) {
3006
            if ( ( aExp | aSig ) == 0 ) {
3007
 invalid:
3008
                float_raise( float_flag_invalid );
3009
                z.low = floatx80_default_nan_low;
3010
                z.high = floatx80_default_nan_high;
3011
                return z;
3012
            }
3013
            float_raise( float_flag_divbyzero );
3014
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3015
        }
3016
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3017
    }
3018
    if ( aExp == 0 ) {
3019
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3020
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3021
    }
3022
    zExp = aExp - bExp + 0x3FFE;
3023
    rem1 = 0;
3024
    if ( bSig <= aSig ) {
3025
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
3026
        ++zExp;
3027
    }
3028
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3029
    mul64To128( bSig, zSig0, &term0, &term1 );
3030
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3031
    while ( (sbits64) rem0 < 0 ) {
3032
        --zSig0;
3033
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3034
    }
3035
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
3036
    if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3037
        mul64To128( bSig, zSig1, &term1, &term2 );
3038
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3039
        while ( (sbits64) rem1 < 0 ) {
3040
            --zSig1;
3041
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3042
        }
3043
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
3044
    }
3045
    return
3046
        roundAndPackFloatx80(
3047
            floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3048

    
3049
}
3050

    
3051
/*
3052
-------------------------------------------------------------------------------
3053
Returns the remainder of the extended double-precision floating-point value
3054
`a' with respect to the corresponding value `b'.  The operation is performed
3055
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3056
-------------------------------------------------------------------------------
3057
*/
3058
floatx80 floatx80_rem( floatx80 a, floatx80 b )
3059
{
3060
    flag aSign, bSign, zSign;
3061
    int32 aExp, bExp, expDiff;
3062
    bits64 aSig0, aSig1, bSig;
3063
    bits64 q, term0, term1, alternateASig0, alternateASig1;
3064
    floatx80 z;
3065

    
3066
    aSig0 = extractFloatx80Frac( a );
3067
    aExp = extractFloatx80Exp( a );
3068
    aSign = extractFloatx80Sign( a );
3069
    bSig = extractFloatx80Frac( b );
3070
    bExp = extractFloatx80Exp( b );
3071
    bSign = extractFloatx80Sign( b );
3072
    if ( aExp == 0x7FFF ) {
3073
        if (    (bits64) ( aSig0<<1 )
3074
             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3075
            return propagateFloatx80NaN( a, b );
3076
        }
3077
        goto invalid;
3078
    }
3079
    if ( bExp == 0x7FFF ) {
3080
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3081
        return a;
3082
    }
3083
    if ( bExp == 0 ) {
3084
        if ( bSig == 0 ) {
3085
 invalid:
3086
            float_raise( float_flag_invalid );
3087
            z.low = floatx80_default_nan_low;
3088
            z.high = floatx80_default_nan_high;
3089
            return z;
3090
        }
3091
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3092
    }
3093
    if ( aExp == 0 ) {
3094
        if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3095
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3096
    }
3097
    bSig |= LIT64( 0x8000000000000000 );
3098
    zSign = aSign;
3099
    expDiff = aExp - bExp;
3100
    aSig1 = 0;
3101
    if ( expDiff < 0 ) {
3102
        if ( expDiff < -1 ) return a;
3103
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3104
        expDiff = 0;
3105
    }
3106
    q = ( bSig <= aSig0 );
3107
    if ( q ) aSig0 -= bSig;
3108
    expDiff -= 64;
3109
    while ( 0 < expDiff ) {
3110
        q = estimateDiv128To64( aSig0, aSig1, bSig );
3111
        q = ( 2 < q ) ? q - 2 : 0;
3112
        mul64To128( bSig, q, &term0, &term1 );
3113
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3114
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3115
        expDiff -= 62;
3116
    }
3117
    expDiff += 64;
3118
    if ( 0 < expDiff ) {
3119
        q = estimateDiv128To64( aSig0, aSig1, bSig );
3120
        q = ( 2 < q ) ? q - 2 : 0;
3121
        q >>= 64 - expDiff;
3122
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3123
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3124
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3125
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
3126
            ++q;
3127
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3128
        }
3129
    }
3130
    else {
3131
        term1 = 0;
3132
        term0 = bSig;
3133
    }
3134
    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3135
    if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3136
         || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3137
              && ( q & 1 ) )
3138
       ) {
3139
        aSig0 = alternateASig0;
3140
        aSig1 = alternateASig1;
3141
        zSign = ! zSign;
3142
    }
3143
    return
3144
        normalizeRoundAndPackFloatx80(
3145
            80, zSign, bExp + expDiff, aSig0, aSig1 );
3146

    
3147
}
3148

    
3149
/*
3150
-------------------------------------------------------------------------------
3151
Returns the square root of the extended double-precision floating-point
3152
value `a'.  The operation is performed according to the IEC/IEEE Standard
3153
for Binary Floating-point Arithmetic.
3154
-------------------------------------------------------------------------------
3155
*/
3156
floatx80 floatx80_sqrt( floatx80 a )
3157
{
3158
    flag aSign;
3159
    int32 aExp, zExp;
3160
    bits64 aSig0, aSig1, zSig0, zSig1;
3161
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3162
    bits64 shiftedRem0, shiftedRem1;
3163
    floatx80 z;
3164

    
3165
    aSig0 = extractFloatx80Frac( a );
3166
    aExp = extractFloatx80Exp( a );
3167
    aSign = extractFloatx80Sign( a );
3168
    if ( aExp == 0x7FFF ) {
3169
        if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3170
        if ( ! aSign ) return a;
3171
        goto invalid;
3172
    }
3173
    if ( aSign ) {
3174
        if ( ( aExp | aSig0 ) == 0 ) return a;
3175
 invalid:
3176
        float_raise( float_flag_invalid );
3177
        z.low = floatx80_default_nan_low;
3178
        z.high = floatx80_default_nan_high;
3179
        return z;
3180
    }
3181
    if ( aExp == 0 ) {
3182
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3183
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3184
    }
3185
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3186
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3187
    zSig0 <<= 31;
3188
    aSig1 = 0;
3189
    shift128Right( aSig0, 0, ( aExp & 1 ) + 2, &aSig0, &aSig1 );
3190
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4;
3191
    if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF );
3192
    shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 );
3193
    mul64To128( zSig0, zSig0, &term0, &term1 );
3194
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3195
    while ( (sbits64) rem0 < 0 ) {
3196
        --zSig0;
3197
        shortShift128Left( 0, zSig0, 1, &term0, &term1 );
3198
        term1 |= 1;
3199
        add128( rem0, rem1, term0, term1, &rem0, &rem1 );
3200
    }
3201
    shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 );
3202
    zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 );
3203
    if ( (bits64) ( zSig1<<1 ) <= 10 ) {
3204
        if ( zSig1 == 0 ) zSig1 = 1;
3205
        mul64To128( zSig0, zSig1, &term1, &term2 );
3206
        shortShift128Left( term1, term2, 1, &term1, &term2 );
3207
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3208
        mul64To128( zSig1, zSig1, &term2, &term3 );
3209
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3210
        while ( (sbits64) rem1 < 0 ) {
3211
            --zSig1;
3212
            shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 );
3213
            term3 |= 1;
3214
            add192(
3215
                rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 );
3216
        }
3217
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3218
    }
3219
    return
3220
        roundAndPackFloatx80(
3221
            floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
3222

    
3223
}
3224

    
3225
/*
3226
-------------------------------------------------------------------------------
3227
Returns 1 if the extended double-precision floating-point value `a' is
3228
equal to the corresponding value `b', and 0 otherwise.  The comparison is
3229
performed according to the IEC/IEEE Standard for Binary Floating-point
3230
Arithmetic.
3231
-------------------------------------------------------------------------------
3232
*/
3233
flag floatx80_eq( floatx80 a, floatx80 b )
3234
{
3235

    
3236
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3237
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3238
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3239
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3240
       ) {
3241
        if (    floatx80_is_signaling_nan( a )
3242
             || floatx80_is_signaling_nan( b ) ) {
3243
            float_raise( float_flag_invalid );
3244
        }
3245
        return 0;
3246
    }
3247
    return
3248
           ( a.low == b.low )
3249
        && (    ( a.high == b.high )
3250
             || (    ( a.low == 0 )
3251
                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3252
           );
3253

    
3254
}
3255

    
3256
/*
3257
-------------------------------------------------------------------------------
3258
Returns 1 if the extended double-precision floating-point value `a' is
3259
less than or equal to the corresponding value `b', and 0 otherwise.  The
3260
comparison is performed according to the IEC/IEEE Standard for Binary
3261
Floating-point Arithmetic.
3262
-------------------------------------------------------------------------------
3263
*/
3264
flag floatx80_le( floatx80 a, floatx80 b )
3265
{
3266
    flag aSign, bSign;
3267

    
3268
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3269
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3270
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3271
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3272
       ) {
3273
        float_raise( float_flag_invalid );
3274
        return 0;
3275
    }
3276
    aSign = extractFloatx80Sign( a );
3277
    bSign = extractFloatx80Sign( b );
3278
    if ( aSign != bSign ) {
3279
        return
3280
               aSign
3281
            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3282
                 == 0 );
3283
    }
3284
    return
3285
          aSign ? le128( b.high, b.low, a.high, a.low )
3286
        : le128( a.high, a.low, b.high, b.low );
3287

    
3288
}
3289

    
3290
/*
3291
-------------------------------------------------------------------------------
3292
Returns 1 if the extended double-precision floating-point value `a' is
3293
less than the corresponding value `b', and 0 otherwise.  The comparison
3294
is performed according to the IEC/IEEE Standard for Binary Floating-point
3295
Arithmetic.
3296
-------------------------------------------------------------------------------
3297
*/
3298
flag floatx80_lt( floatx80 a, floatx80 b )
3299
{
3300
    flag aSign, bSign;
3301

    
3302
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3303
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3304
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3305
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3306
       ) {
3307
        float_raise( float_flag_invalid );
3308
        return 0;
3309
    }
3310
    aSign = extractFloatx80Sign( a );
3311
    bSign = extractFloatx80Sign( b );
3312
    if ( aSign != bSign ) {
3313
        return
3314
               aSign
3315
            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3316
                 != 0 );
3317
    }
3318
    return
3319
          aSign ? lt128( b.high, b.low, a.high, a.low )
3320
        : lt128( a.high, a.low, b.high, b.low );
3321

    
3322
}
3323

    
3324
/*
3325
-------------------------------------------------------------------------------
3326
Returns 1 if the extended double-precision floating-point value `a' is equal
3327
to the corresponding value `b', and 0 otherwise.  The invalid exception is
3328
raised if either operand is a NaN.  Otherwise, the comparison is performed
3329
according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3330
-------------------------------------------------------------------------------
3331
*/
3332
flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3333
{
3334

    
3335
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3336
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3337
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3338
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3339
       ) {
3340
        float_raise( float_flag_invalid );
3341
        return 0;
3342
    }
3343
    return
3344
           ( a.low == b.low )
3345
        && (    ( a.high == b.high )
3346
             || (    ( a.low == 0 )
3347
                  && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3348
           );
3349

    
3350
}
3351

    
3352
/*
3353
-------------------------------------------------------------------------------
3354
Returns 1 if the extended double-precision floating-point value `a' is less
3355
than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
3356
do not cause an exception.  Otherwise, the comparison is performed according
3357
to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3358
-------------------------------------------------------------------------------
3359
*/
3360
flag floatx80_le_quiet( floatx80 a, floatx80 b )
3361
{
3362
    flag aSign, bSign;
3363

    
3364
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3365
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3366
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3367
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3368
       ) {
3369
        if (    floatx80_is_signaling_nan( a )
3370
             || floatx80_is_signaling_nan( b ) ) {
3371
            float_raise( float_flag_invalid );
3372
        }
3373
        return 0;
3374
    }
3375
    aSign = extractFloatx80Sign( a );
3376
    bSign = extractFloatx80Sign( b );
3377
    if ( aSign != bSign ) {
3378
        return
3379
               aSign
3380
            || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3381
                 == 0 );
3382
    }
3383
    return
3384
          aSign ? le128( b.high, b.low, a.high, a.low )
3385
        : le128( a.high, a.low, b.high, b.low );
3386

    
3387
}
3388

    
3389
/*
3390
-------------------------------------------------------------------------------
3391
Returns 1 if the extended double-precision floating-point value `a' is less
3392
than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
3393
an exception.  Otherwise, the comparison is performed according to the
3394
IEC/IEEE Standard for Binary Floating-point Arithmetic.
3395
-------------------------------------------------------------------------------
3396
*/
3397
flag floatx80_lt_quiet( floatx80 a, floatx80 b )
3398
{
3399
    flag aSign, bSign;
3400

    
3401
    if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3402
              && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3403
         || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3404
              && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3405
       ) {
3406
        if (    floatx80_is_signaling_nan( a )
3407
             || floatx80_is_signaling_nan( b ) ) {
3408
            float_raise( float_flag_invalid );
3409
        }
3410
        return 0;
3411
    }
3412
    aSign = extractFloatx80Sign( a );
3413
    bSign = extractFloatx80Sign( b );
3414
    if ( aSign != bSign ) {
3415
        return
3416
               aSign
3417
            && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3418
                 != 0 );
3419
    }
3420
    return
3421
          aSign ? lt128( b.high, b.low, a.high, a.low )
3422
        : lt128( a.high, a.low, b.high, b.low );
3423

    
3424
}
3425

    
3426
#endif
3427