/* Source listing: root / target-i386 / ops_sse.h @ b8b6a50b */

/*
 *  MMX/3DNow!/SSE/SSE2/SSE3/PNI support
 *
 *  Copyright (c) 2005 Fabrice Bellard
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
/*
 * Register-width selection.  SHIFT == 0 maps the generic names used by the
 * helpers in this file onto MMXReg and the MMX_* lane accessors and gives
 * every generated function the _mmx suffix; any other SHIFT value selects
 * XMMReg, the XMM_* accessors and the _xmm suffix.
 */
#if SHIFT == 0
#define Reg MMXReg
/* Discards its arguments: XMM-only statements disappear in this build. */
#define XMM_ONLY(x...)
#define B(n) MMX_B(n)
#define W(n) MMX_W(n)
#define L(n) MMX_L(n)
/* The index is ignored; Q(n) always names the member q. */
#define Q(n) q
#define SUFFIX _mmx
#else
#define Reg XMMReg
/* Passes its arguments through unchanged. */
#define XMM_ONLY(x...) x
#define B(n) XMM_B(n)
#define W(n) XMM_W(n)
#define L(n) XMM_L(n)
#define Q(n) XMM_Q(n)
#define SUFFIX _xmm
#endif

/*
 * Right-shift every 16-bit lane of *d by the count in the low byte of *s.
 * If the low quadword of *s is greater than 15 the whole of *d is cleared
 * instead.
 */
void glue(helper_psrlw, SUFFIX)(Reg *d, Reg *s)
{
    int shift, i;

    if (s->Q(0) > 15) {
        d->Q(0) = 0;
#if SHIFT == 1
        d->Q(1) = 0;
#endif
    } else {
        shift = s->B(0);
        /* 4 word lanes when SHIFT == 0, 8 when SHIFT == 1 */
        for(i = 0; i < (4 << SHIFT); i++)
            d->W(i) >>= shift;
    }
    FORCE_RET();
}

/*
 * Right-shift every 16-bit lane of *d, reinterpreted as int16_t, by the
 * count in the low byte of *s.  The count is replaced by 15 when the low
 * quadword of *s is greater than 15.
 */
void glue(helper_psraw, SUFFIX)(Reg *d, Reg *s)
{
    int shift, i;

    if (s->Q(0) > 15) {
        shift = 15;
    } else {
        shift = s->B(0);
    }
    /* 4 word lanes when SHIFT == 0, 8 when SHIFT == 1 */
    for(i = 0; i < (4 << SHIFT); i++)
        d->W(i) = (int16_t)d->W(i) >> shift;
}

/*
 * Left-shift every 16-bit lane of *d by the count in the low byte of *s.
 * If the low quadword of *s is greater than 15 the whole of *d is cleared
 * instead.
 */
void glue(helper_psllw, SUFFIX)(Reg *d, Reg *s)
{
    int shift, i;

    if (s->Q(0) > 15) {
        d->Q(0) = 0;
#if SHIFT == 1
        d->Q(1) = 0;
#endif
    } else {
        shift = s->B(0);
        /* 4 word lanes when SHIFT == 0, 8 when SHIFT == 1 */
        for(i = 0; i < (4 << SHIFT); i++)
            d->W(i) <<= shift;
    }
    FORCE_RET();
}

/*
 * Right-shift every 32-bit lane of *d by the count in the low byte of *s.
 * If the low quadword of *s is greater than 31 the whole of *d is cleared
 * instead.
 */
void glue(helper_psrld, SUFFIX)(Reg *d, Reg *s)
{
    int shift, i;

    if (s->Q(0) > 31) {
        d->Q(0) = 0;
#if SHIFT == 1
        d->Q(1) = 0;
#endif
    } else {
        shift = s->B(0);
        /* 2 dword lanes when SHIFT == 0, 4 when SHIFT == 1 */
        for(i = 0; i < (2 << SHIFT); i++)
            d->L(i) >>= shift;
    }
    FORCE_RET();
}

/*
 * Right-shift every 32-bit lane of *d, reinterpreted as int32_t, by the
 * count in the low byte of *s.  The count is replaced by 31 when the low
 * quadword of *s is greater than 31.
 */
void glue(helper_psrad, SUFFIX)(Reg *d, Reg *s)
{
    int shift, i;

    if (s->Q(0) > 31) {
        shift = 31;
    } else {
        shift = s->B(0);
    }
    /* 2 dword lanes when SHIFT == 0, 4 when SHIFT == 1 */
    for(i = 0; i < (2 << SHIFT); i++)
        d->L(i) = (int32_t)d->L(i) >> shift;
}

/*
 * Left-shift every 32-bit lane of *d by the count in the low byte of *s.
 * If the low quadword of *s is greater than 31 the whole of *d is cleared
 * instead.
 */
void glue(helper_pslld, SUFFIX)(Reg *d, Reg *s)
{
    int shift, i;

    if (s->Q(0) > 31) {
        d->Q(0) = 0;
#if SHIFT == 1
        d->Q(1) = 0;
#endif
    } else {
        shift = s->B(0);
        /* 2 dword lanes when SHIFT == 0, 4 when SHIFT == 1 */
        for(i = 0; i < (2 << SHIFT); i++)
            d->L(i) <<= shift;
    }
    FORCE_RET();
}

/*
 * Right-shift every 64-bit lane of *d by the count in the low byte of *s.
 * If the low quadword of *s is greater than 63 the whole of *d is cleared
 * instead.
 */
void glue(helper_psrlq, SUFFIX)(Reg *d, Reg *s)
{
    int shift;

    if (s->Q(0) > 63) {
        d->Q(0) = 0;
#if SHIFT == 1
        d->Q(1) = 0;
#endif
    } else {
        shift = s->B(0);
        d->Q(0) >>= shift;
#if SHIFT == 1
        d->Q(1) >>= shift;
#endif
    }
    FORCE_RET();
}

/*
 * Left-shift every 64-bit lane of *d by the count in the low byte of *s.
 * If the low quadword of *s is greater than 63 the whole of *d is cleared
 * instead.
 */
void glue(helper_psllq, SUFFIX)(Reg *d, Reg *s)
{
    int shift;

    if (s->Q(0) > 63) {
        d->Q(0) = 0;
#if SHIFT == 1
        d->Q(1) = 0;
#endif
    } else {
        shift = s->B(0);
        d->Q(0) <<= shift;
#if SHIFT == 1
        d->Q(1) <<= shift;
#endif
    }
    FORCE_RET();
}

#if SHIFT == 1
/*
 * Shift the 16 bytes of *d towards byte 0 by the byte count held in the low
 * 32 bits of *s; the vacated top bytes are zeroed.  Counts above 16 behave
 * like 16 (the register is cleared).
 */
void glue(helper_psrldq, SUFFIX)(Reg *d, Reg *s)
{
    uint32_t count;
    int shift, i;

    /* Clamp while the count is still unsigned: a value with bit 31 set would
       turn negative in an int, pass a "> 16" test and index outside *d. */
    count = s->L(0);
    shift = count > 16 ? 16 : (int)count;
    for(i = 0; i < 16 - shift; i++)
        d->B(i) = d->B(i + shift);
    for(i = 16 - shift; i < 16; i++)
        d->B(i) = 0;
    FORCE_RET();
}

/*
 * Shift the 16 bytes of *d towards byte 15 by the byte count held in the low
 * 32 bits of *s; the vacated low bytes are zeroed.  Counts above 16 behave
 * like 16 (the register is cleared).
 */
void glue(helper_pslldq, SUFFIX)(Reg *d, Reg *s)
{
    uint32_t count;
    int shift, i;

    /* Same unsigned clamp as helper_psrldq, for the same reason. */
    count = s->L(0);
    shift = count > 16 ? 16 : (int)count;
    /* Walk downwards so every source byte is read before it is overwritten. */
    for(i = 15; i >= shift; i--)
        d->B(i) = d->B(i - shift);
    for(i = 0; i < shift; i++)
        d->B(i) = 0;
    FORCE_RET();
}
#endif

/*
 * Define glue(name, SUFFIX)(d, s), which applies the two-operand lane
 * operation F to every byte lane: d[i] = F(d[i], s[i]).
 * 8 lanes when SHIFT == 0, 16 when SHIFT == 1.
 */
#define SSE_HELPER_B(name, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s)\
{\
    int i;\
    for(i = 0; i < (8 << SHIFT); i++)\
        d->B(i) = F(d->B(i), s->B(i));\
}

/*
 * Define glue(name, SUFFIX)(d, s), which applies the two-operand lane
 * operation F to every 16-bit lane: d[i] = F(d[i], s[i]).
 * 4 lanes when SHIFT == 0, 8 when SHIFT == 1.
 */
#define SSE_HELPER_W(name, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s)\
{\
    int i;\
    for(i = 0; i < (4 << SHIFT); i++)\
        d->W(i) = F(d->W(i), s->W(i));\
}

/*
 * Define glue(name, SUFFIX)(d, s), which applies the two-operand lane
 * operation F to every 32-bit lane: d[i] = F(d[i], s[i]).
 * 2 lanes when SHIFT == 0, 4 when SHIFT == 1.
 */
#define SSE_HELPER_L(name, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s)\
{\
    int i;\
    for(i = 0; i < (2 << SHIFT); i++)\
        d->L(i) = F(d->L(i), s->L(i));\
}

/*
 * Define glue(name, SUFFIX)(d, s), which applies the two-operand lane
 * operation F to every 64-bit lane: d[i] = F(d[i], s[i]).
 * The second lane exists only in the XMM build (XMM_ONLY).
 */
#define SSE_HELPER_Q(name, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s)\
{\
    d->Q(0) = F(d->Q(0), s->Q(0));\
    XMM_ONLY(\
    d->Q(1) = F(d->Q(1), s->Q(1));\
    )\
}

#if SHIFT == 0
/* Clamp x to the unsigned 8-bit range [0, 255]. */
static inline int satub(int x)
{
    if (x < 0)
        return 0;
    else if (x > 255)
        return 255;
    else
        return x;
}

/* Clamp x to the unsigned 16-bit range [0, 65535]. */
static inline int satuw(int x)
{
    if (x < 0)
        return 0;
    else if (x > 65535)
        return 65535;
    else
        return x;
}

/* Clamp x to the signed 8-bit range [-128, 127]. */
static inline int satsb(int x)
{
    if (x < -128)
        return -128;
    else if (x > 127)
        return 127;
    else
        return x;
}

/* Clamp x to the signed 16-bit range [-32768, 32767]. */
static inline int satsw(int x)
{
    if (x < -32768)
        return -32768;
    else if (x > 32767)
        return 32767;
    else
        return x;
}

/*
 * Two-operand lane operations for the SSE_HELPER_* generators.  Every
 * expansion is wrapped in parentheses so that it remains a single operand
 * whatever expression it is substituted into.
 */
#define FADD(a, b) ((a) + (b))
#define FADDUB(a, b) satub((a) + (b))
#define FADDUW(a, b) satuw((a) + (b))
#define FADDSB(a, b) satsb((int8_t)(a) + (int8_t)(b))
#define FADDSW(a, b) satsw((int16_t)(a) + (int16_t)(b))

#define FSUB(a, b) ((a) - (b))
#define FSUBUB(a, b) satub((a) - (b))
#define FSUBUW(a, b) satuw((a) - (b))
#define FSUBSB(a, b) satsb((int8_t)(a) - (int8_t)(b))
#define FSUBSW(a, b) satsw((int16_t)(a) - (int16_t)(b))
#define FMINUB(a, b) (((a) < (b)) ? (a) : (b))
#define FMINSW(a, b) (((int16_t)(a) < (int16_t)(b)) ? (a) : (b))
#define FMAXUB(a, b) (((a) > (b)) ? (a) : (b))
#define FMAXSW(a, b) (((int16_t)(a) > (int16_t)(b)) ? (a) : (b))

#define FAND(a, b) ((a) & (b))
#define FANDN(a, b) ((~(a)) & (b))
#define FOR(a, b) ((a) | (b))
#define FXOR(a, b) ((a) ^ (b))

/* Comparisons yield -1 (all bits set once stored in a lane) or 0. */
#define FCMPGTB(a, b) ((int8_t)(a) > (int8_t)(b) ? -1 : 0)
#define FCMPGTW(a, b) ((int16_t)(a) > (int16_t)(b) ? -1 : 0)
#define FCMPGTL(a, b) ((int32_t)(a) > (int32_t)(b) ? -1 : 0)
#define FCMPEQ(a, b) ((a) == (b) ? -1 : 0)

/*
 * The unsigned 16x16 products are formed in uint32_t: two uint16_t operands
 * would otherwise be promoted to int, and 0xffff * 0xffff does not fit.
 */
#define FMULLW(a, b) ((uint32_t)(a) * (uint32_t)(b))
#define FMULHRW(a, b) (((int16_t)(a) * (int16_t)(b) + 0x8000) >> 16)
#define FMULHUW(a, b) (((uint32_t)(a) * (uint32_t)(b)) >> 16)
#define FMULHW(a, b) (((int16_t)(a) * (int16_t)(b)) >> 16)

/* Average rounded upwards: (a + b + 1) / 2. */
#define FAVG(a, b) (((a) + (b) + 1) >> 1)
#endif

SSE_HELPER_B(helper_paddb, FADD)
370
SSE_HELPER_W(helper_paddw, FADD)
371
SSE_HELPER_L(helper_paddl, FADD)
372
SSE_HELPER_Q(helper_paddq, FADD)
373

    
374
SSE_HELPER_B(helper_psubb, FSUB)
375
SSE_HELPER_W(helper_psubw, FSUB)
376
SSE_HELPER_L(helper_psubl, FSUB)
377
SSE_HELPER_Q(helper_psubq, FSUB)
378

    
379
SSE_HELPER_B(helper_paddusb, FADDUB)
380
SSE_HELPER_B(helper_paddsb, FADDSB)
381
SSE_HELPER_B(helper_psubusb, FSUBUB)
382
SSE_HELPER_B(helper_psubsb, FSUBSB)
383

    
384
SSE_HELPER_W(helper_paddusw, FADDUW)
385
SSE_HELPER_W(helper_paddsw, FADDSW)
386
SSE_HELPER_W(helper_psubusw, FSUBUW)
387
SSE_HELPER_W(helper_psubsw, FSUBSW)
388

    
389
SSE_HELPER_B(helper_pminub, FMINUB)
390
SSE_HELPER_B(helper_pmaxub, FMAXUB)
391

    
392
SSE_HELPER_W(helper_pminsw, FMINSW)
393
SSE_HELPER_W(helper_pmaxsw, FMAXSW)
394

    
395
SSE_HELPER_Q(helper_pand, FAND)
396
SSE_HELPER_Q(helper_pandn, FANDN)
397
SSE_HELPER_Q(helper_por, FOR)
398
SSE_HELPER_Q(helper_pxor, FXOR)
399

    
400
SSE_HELPER_B(helper_pcmpgtb, FCMPGTB)
401
SSE_HELPER_W(helper_pcmpgtw, FCMPGTW)
402
SSE_HELPER_L(helper_pcmpgtl, FCMPGTL)
403

    
404
SSE_HELPER_B(helper_pcmpeqb, FCMPEQ)
405
SSE_HELPER_W(helper_pcmpeqw, FCMPEQ)
406
SSE_HELPER_L(helper_pcmpeql, FCMPEQ)
407

    
408
SSE_HELPER_W(helper_pmullw, FMULLW)
409
#if SHIFT == 0
410
SSE_HELPER_W(helper_pmulhrw, FMULHRW)
411
#endif
412
SSE_HELPER_W(helper_pmulhuw, FMULHUW)
413
SSE_HELPER_W(helper_pmulhw, FMULHW)
414

    
415
SSE_HELPER_B(helper_pavgb, FAVG)
416
SSE_HELPER_W(helper_pavgw, FAVG)
417

    
418
/*
 * Multiply 32-bit lane 0 of *s and *d as unsigned values and store the full
 * 64-bit product in quadword 0 of *d.  The XMM build does the same with
 * 32-bit lane 2 into quadword 1.
 */
void glue(helper_pmuludq, SUFFIX) (Reg *d, Reg *s)
{
    /* Widen before multiplying so that the product is not truncated. */
    d->Q(0) = (uint64_t)s->L(0) * (uint64_t)d->L(0);
#if SHIFT == 1
    d->Q(1) = (uint64_t)s->L(2) * (uint64_t)d->L(2);
#endif
}

/*
 * For each 32-bit lane i of *d: multiply the signed 16-bit lanes 2i and 2i+1
 * of *s and *d pairwise and store the sum of the two products.
 */
void glue(helper_pmaddwd, SUFFIX) (Reg *d, Reg *s)
{
    int i;

    for(i = 0; i < (2 << SHIFT); i++) {
        int32_t even = (int16_t)s->W(2*i) * (int16_t)d->W(2*i);
        int32_t odd = (int16_t)s->W(2*i+1) * (int16_t)d->W(2*i+1);

        /* Each product fits in 32 bits, but their sum may not (both pairs
           equal to -32768 give 2^31), so add modulo 2^32 in unsigned
           arithmetic instead of overflowing a signed int. */
        d->L(i) = (uint32_t)even + (uint32_t)odd;
    }
    FORCE_RET();
}

#if SHIFT == 0
/* Absolute value of a; helper_psadbw applies it to byte differences. */
static inline int abs1(int a)
{
    if (a < 0)
        return -a;
    else
        return a;
}
#endif
/*
 * Sum of absolute byte differences.  The eight differences |d[i] - s[i]| for
 * bytes 0..7 are added up and the total replaces quadword 0 of *d; the XMM
 * build repeats this for bytes 8..15 into quadword 1.
 */
void glue(helper_psadbw, SUFFIX) (Reg *d, Reg *s)
{
    unsigned int val;
    int i;

    val = 0;
    for(i = 0; i < 8; i++)
        val += abs1(d->B(i) - s->B(i));
    /* Stored only after all eight source bytes of this half were read. */
    d->Q(0) = val;
#if SHIFT == 1
    val = 0;
    for(i = 8; i < 16; i++)
        val += abs1(d->B(i) - s->B(i));
    d->Q(1) = val;
#endif
}

/*
 * Byte-masked store: for every byte lane i whose mask byte in *s has bit 7
 * set, pass byte i of *d to stb() at address a0 + i.  Lanes with the bit
 * clear are not stored.
 */
void glue(helper_maskmov, SUFFIX) (Reg *d, Reg *s, target_ulong a0)
{
    int i;
    /* 8 byte lanes when SHIFT == 0, 16 when SHIFT == 1 */
    for(i = 0; i < (8 << SHIFT); i++) {
        if (s->B(i) & 0x80)
            stb(a0 + i, d->B(i));
    }
    FORCE_RET();
}

/* Load a 32-bit value into lane 0 of *d and zero the rest of the register. */
void glue(helper_movl_mm_T0, SUFFIX) (Reg *d, uint32_t val)
{
    d->L(0) = val;
    d->L(1) = 0;
#if SHIFT == 1
    d->Q(1) = 0;
#endif
}

#ifdef TARGET_X86_64
/* Load a 64-bit value into quadword 0 of *d and zero the rest of the register. */
void glue(helper_movq_mm_T0, SUFFIX) (Reg *d, uint64_t val)
{
    d->Q(0) = val;
#if SHIFT == 1
    d->Q(1) = 0;
#endif
}
#endif

#if SHIFT == 0
504
void glue(helper_pshufw, SUFFIX) (Reg *d, Reg *s, int order)
505
{
506
    Reg r;
507
    r.W(0) = s->W(order & 3);
508
    r.W(1) = s->W((order >> 2) & 3);
509
    r.W(2) = s->W((order >> 4) & 3);
510
    r.W(3) = s->W((order >> 6) & 3);
511
    *d = r;
512
}
513
#else
514
void helper_shufps(Reg *d, Reg *s, int order)
515
{
516
    Reg r;
517
    r.L(0) = d->L(order & 3);
518
    r.L(1) = d->L((order >> 2) & 3);
519
    r.L(2) = s->L((order >> 4) & 3);
520
    r.L(3) = s->L((order >> 6) & 3);
521
    *d = r;
522
}
523

    
524
void helper_shufpd(Reg *d, Reg *s, int order)
525
{
526
    Reg r;
527
    r.Q(0) = d->Q(order & 1);
528
    r.Q(1) = s->Q((order >> 1) & 1);
529
    *d = r;
530
}
531

    
532
void glue(helper_pshufd, SUFFIX) (Reg *d, Reg *s, int order)
533
{
534
    Reg r;
535
    r.L(0) = s->L(order & 3);
536
    r.L(1) = s->L((order >> 2) & 3);
537
    r.L(2) = s->L((order >> 4) & 3);
538
    r.L(3) = s->L((order >> 6) & 3);
539
    *d = r;
540
}
541

    
542
void glue(helper_pshuflw, SUFFIX) (Reg *d, Reg *s, int order)
543
{
544
    Reg r;
545
    r.W(0) = s->W(order & 3);
546
    r.W(1) = s->W((order >> 2) & 3);
547
    r.W(2) = s->W((order >> 4) & 3);
548
    r.W(3) = s->W((order >> 6) & 3);
549
    r.Q(1) = s->Q(1);
550
    *d = r;
551
}
552

    
553
void glue(helper_pshufhw, SUFFIX) (Reg *d, Reg *s, int order)
554
{
555
    Reg r;
556
    r.Q(0) = s->Q(0);
557
    r.W(4) = s->W(4 + (order & 3));
558
    r.W(5) = s->W(4 + ((order >> 2) & 3));
559
    r.W(6) = s->W(4 + ((order >> 4) & 3));
560
    r.W(7) = s->W(4 + ((order >> 6) & 3));
561
    *d = r;
562
}
563
#endif
564

    
565
#if SHIFT == 1
/* FPU ops */
/* XXX: not accurate */

/*
 * Generate the four floating-point variants of one operation:
 *   helper_<name>ps - all four single-precision lanes
 *   helper_<name>ss - single-precision lane 0 only
 *   helper_<name>pd - both double-precision lanes
 *   helper_<name>sd - double-precision lane 0 only
 * F is invoked as F(size, dest_lane, src_lane) with size 32 or 64; lanes the
 * variant does not name are left untouched.
 */
#define SSE_HELPER_S(name, F)\
void helper_ ## name ## ps (Reg *d, Reg *s)\
{\
    d->XMM_S(0) = F(32, d->XMM_S(0), s->XMM_S(0));\
    d->XMM_S(1) = F(32, d->XMM_S(1), s->XMM_S(1));\
    d->XMM_S(2) = F(32, d->XMM_S(2), s->XMM_S(2));\
    d->XMM_S(3) = F(32, d->XMM_S(3), s->XMM_S(3));\
}\
\
void helper_ ## name ## ss (Reg *d, Reg *s)\
{\
    d->XMM_S(0) = F(32, d->XMM_S(0), s->XMM_S(0));\
}\
void helper_ ## name ## pd (Reg *d, Reg *s)\
{\
    d->XMM_D(0) = F(64, d->XMM_D(0), s->XMM_D(0));\
    d->XMM_D(1) = F(64, d->XMM_D(1), s->XMM_D(1));\
}\
\
void helper_ ## name ## sd (Reg *d, Reg *s)\
{\
    d->XMM_D(0) = F(64, d->XMM_D(0), s->XMM_D(0));\
}

#define FPU_ADD(size, a, b) float ## size ## _add(a, b, &env->sse_status)
594
#define FPU_SUB(size, a, b) float ## size ## _sub(a, b, &env->sse_status)
595
#define FPU_MUL(size, a, b) float ## size ## _mul(a, b, &env->sse_status)
596
#define FPU_DIV(size, a, b) float ## size ## _div(a, b, &env->sse_status)
597
#define FPU_MIN(size, a, b) (a) < (b) ? (a) : (b)
598
#define FPU_MAX(size, a, b) (a) > (b) ? (a) : (b)
599
#define FPU_SQRT(size, a, b) float ## size ## _sqrt(b, &env->sse_status)
600

    
601
SSE_HELPER_S(add, FPU_ADD)
602
SSE_HELPER_S(sub, FPU_SUB)
603
SSE_HELPER_S(mul, FPU_MUL)
604
SSE_HELPER_S(div, FPU_DIV)
605
SSE_HELPER_S(min, FPU_MIN)
606
SSE_HELPER_S(max, FPU_MAX)
607
SSE_HELPER_S(sqrt, FPU_SQRT)
608

    
609

    
610
/* float to float conversions */
611
void helper_cvtps2pd(Reg *d, Reg *s)
612
{
613
    float32 s0, s1;
614
    s0 = s->XMM_S(0);
615
    s1 = s->XMM_S(1);
616
    d->XMM_D(0) = float32_to_float64(s0, &env->sse_status);
617
    d->XMM_D(1) = float32_to_float64(s1, &env->sse_status);
618
}
619

    
620
void helper_cvtpd2ps(Reg *d, Reg *s)
621
{
622
    d->XMM_S(0) = float64_to_float32(s->XMM_D(0), &env->sse_status);
623
    d->XMM_S(1) = float64_to_float32(s->XMM_D(1), &env->sse_status);
624
    d->Q(1) = 0;
625
}
626

    
627
void helper_cvtss2sd(Reg *d, Reg *s)
628
{
629
    d->XMM_D(0) = float32_to_float64(s->XMM_S(0), &env->sse_status);
630
}
631

    
632
void helper_cvtsd2ss(Reg *d, Reg *s)
633
{
634
    d->XMM_S(0) = float64_to_float32(s->XMM_D(0), &env->sse_status);
635
}
636

    
637
/* integer to float */
638
void helper_cvtdq2ps(Reg *d, Reg *s)
639
{
640
    d->XMM_S(0) = int32_to_float32(s->XMM_L(0), &env->sse_status);
641
    d->XMM_S(1) = int32_to_float32(s->XMM_L(1), &env->sse_status);
642
    d->XMM_S(2) = int32_to_float32(s->XMM_L(2), &env->sse_status);
643
    d->XMM_S(3) = int32_to_float32(s->XMM_L(3), &env->sse_status);
644
}
645

    
646
void helper_cvtdq2pd(Reg *d, Reg *s)
647
{
648
    int32_t l0, l1;
649
    l0 = (int32_t)s->XMM_L(0);
650
    l1 = (int32_t)s->XMM_L(1);
651
    d->XMM_D(0) = int32_to_float64(l0, &env->sse_status);
652
    d->XMM_D(1) = int32_to_float64(l1, &env->sse_status);
653
}
654

    
655
void helper_cvtpi2ps(XMMReg *d, MMXReg *s)
656
{
657
    d->XMM_S(0) = int32_to_float32(s->MMX_L(0), &env->sse_status);
658
    d->XMM_S(1) = int32_to_float32(s->MMX_L(1), &env->sse_status);
659
}
660

    
661
void helper_cvtpi2pd(XMMReg *d, MMXReg *s)
662
{
663
    d->XMM_D(0) = int32_to_float64(s->MMX_L(0), &env->sse_status);
664
    d->XMM_D(1) = int32_to_float64(s->MMX_L(1), &env->sse_status);
665
}
666

    
667
void helper_cvtsi2ss(XMMReg *d, uint32_t val)
668
{
669
    d->XMM_S(0) = int32_to_float32(val, &env->sse_status);
670
}
671

    
672
void helper_cvtsi2sd(XMMReg *d, uint32_t val)
673
{
674
    d->XMM_D(0) = int32_to_float64(val, &env->sse_status);
675
}
676

    
677
#ifdef TARGET_X86_64
678
void helper_cvtsq2ss(XMMReg *d, uint64_t val)
679
{
680
    d->XMM_S(0) = int64_to_float32(val, &env->sse_status);
681
}
682

    
683
void helper_cvtsq2sd(XMMReg *d, uint64_t val)
684
{
685
    d->XMM_D(0) = int64_to_float64(val, &env->sse_status);
686
}
687
#endif
688

    
689
/* float to integer */
690
void helper_cvtps2dq(XMMReg *d, XMMReg *s)
691
{
692
    d->XMM_L(0) = float32_to_int32(s->XMM_S(0), &env->sse_status);
693
    d->XMM_L(1) = float32_to_int32(s->XMM_S(1), &env->sse_status);
694
    d->XMM_L(2) = float32_to_int32(s->XMM_S(2), &env->sse_status);
695
    d->XMM_L(3) = float32_to_int32(s->XMM_S(3), &env->sse_status);
696
}
697

    
698
void helper_cvtpd2dq(XMMReg *d, XMMReg *s)
699
{
700
    d->XMM_L(0) = float64_to_int32(s->XMM_D(0), &env->sse_status);
701
    d->XMM_L(1) = float64_to_int32(s->XMM_D(1), &env->sse_status);
702
    d->XMM_Q(1) = 0;
703
}
704

    
705
void helper_cvtps2pi(MMXReg *d, XMMReg *s)
706
{
707
    d->MMX_L(0) = float32_to_int32(s->XMM_S(0), &env->sse_status);
708
    d->MMX_L(1) = float32_to_int32(s->XMM_S(1), &env->sse_status);
709
}
710

    
711
void helper_cvtpd2pi(MMXReg *d, XMMReg *s)
712
{
713
    d->MMX_L(0) = float64_to_int32(s->XMM_D(0), &env->sse_status);
714
    d->MMX_L(1) = float64_to_int32(s->XMM_D(1), &env->sse_status);
715
}
716

    
717
int32_t helper_cvtss2si(XMMReg *s)
718
{
719
    return float32_to_int32(s->XMM_S(0), &env->sse_status);
720
}
721

    
722
int32_t helper_cvtsd2si(XMMReg *s)
723
{
724
    return float64_to_int32(s->XMM_D(0), &env->sse_status);
725
}
726

    
727
#ifdef TARGET_X86_64
728
int64_t helper_cvtss2sq(XMMReg *s)
729
{
730
    return float32_to_int64(s->XMM_S(0), &env->sse_status);
731
}
732

    
733
int64_t helper_cvtsd2sq(XMMReg *s)
734
{
735
    return float64_to_int64(s->XMM_D(0), &env->sse_status);
736
}
737
#endif
738

    
739
/* float to integer truncated */
740
void helper_cvttps2dq(XMMReg *d, XMMReg *s)
741
{
742
    d->XMM_L(0) = float32_to_int32_round_to_zero(s->XMM_S(0), &env->sse_status);
743
    d->XMM_L(1) = float32_to_int32_round_to_zero(s->XMM_S(1), &env->sse_status);
744
    d->XMM_L(2) = float32_to_int32_round_to_zero(s->XMM_S(2), &env->sse_status);
745
    d->XMM_L(3) = float32_to_int32_round_to_zero(s->XMM_S(3), &env->sse_status);
746
}
747

    
748
void helper_cvttpd2dq(XMMReg *d, XMMReg *s)
749
{
750
    d->XMM_L(0) = float64_to_int32_round_to_zero(s->XMM_D(0), &env->sse_status);
751
    d->XMM_L(1) = float64_to_int32_round_to_zero(s->XMM_D(1), &env->sse_status);
752
    d->XMM_Q(1) = 0;
753
}
754

    
755
void helper_cvttps2pi(MMXReg *d, XMMReg *s)
756
{
757
    d->MMX_L(0) = float32_to_int32_round_to_zero(s->XMM_S(0), &env->sse_status);
758
    d->MMX_L(1) = float32_to_int32_round_to_zero(s->XMM_S(1), &env->sse_status);
759
}
760

    
761
void helper_cvttpd2pi(MMXReg *d, XMMReg *s)
762
{
763
    d->MMX_L(0) = float64_to_int32_round_to_zero(s->XMM_D(0), &env->sse_status);
764
    d->MMX_L(1) = float64_to_int32_round_to_zero(s->XMM_D(1), &env->sse_status);
765
}
766

    
767
int32_t helper_cvttss2si(XMMReg *s)
768
{
769
    return float32_to_int32_round_to_zero(s->XMM_S(0), &env->sse_status);
770
}
771

    
772
int32_t helper_cvttsd2si(XMMReg *s)
773
{
774
    return float64_to_int32_round_to_zero(s->XMM_D(0), &env->sse_status);
775
}
776

    
777
#ifdef TARGET_X86_64
778
int64_t helper_cvttss2sq(XMMReg *s)
779
{
780
    return float32_to_int64_round_to_zero(s->XMM_S(0), &env->sse_status);
781
}
782

    
783
int64_t helper_cvttsd2sq(XMMReg *s)
784
{
785
    return float64_to_int64_round_to_zero(s->XMM_D(0), &env->sse_status);
786
}
787
#endif
788

    
789
/*
 * Approximation helpers.  approx_rsqrt() and approx_rcp() are defined
 * outside this file.
 * NOTE(review): presumably reciprocal square root and reciprocal
 * respectively -- confirm against their definitions.
 */

/* Apply approx_rsqrt() to all four single-precision lanes of *s. */
void helper_rsqrtps(XMMReg *d, XMMReg *s)
{
    d->XMM_S(0) = approx_rsqrt(s->XMM_S(0));
    d->XMM_S(1) = approx_rsqrt(s->XMM_S(1));
    d->XMM_S(2) = approx_rsqrt(s->XMM_S(2));
    d->XMM_S(3) = approx_rsqrt(s->XMM_S(3));
}

/* Apply approx_rsqrt() to lane 0 only; other lanes of *d are untouched. */
void helper_rsqrtss(XMMReg *d, XMMReg *s)
{
    d->XMM_S(0) = approx_rsqrt(s->XMM_S(0));
}

/* Apply approx_rcp() to all four single-precision lanes of *s. */
void helper_rcpps(XMMReg *d, XMMReg *s)
{
    d->XMM_S(0) = approx_rcp(s->XMM_S(0));
    d->XMM_S(1) = approx_rcp(s->XMM_S(1));
    d->XMM_S(2) = approx_rcp(s->XMM_S(2));
    d->XMM_S(3) = approx_rcp(s->XMM_S(3));
}

/* Apply approx_rcp() to lane 0 only; other lanes of *d are untouched. */
void helper_rcpss(XMMReg *d, XMMReg *s)
{
    d->XMM_S(0) = approx_rcp(s->XMM_S(0));
}

/*
 * Horizontal and alternating add/subtract.  The arithmetic goes through the
 * same softfloat calls (with &env->sse_status) that the add/sub helpers
 * generated from FPU_ADD and FPU_SUB use, rather than through the C "+" and
 * "-" operators on float32/float64 values.
 */

/* Pairwise sums: lanes 0-1 of the result come from *d, lanes 2-3 from *s. */
void helper_haddps(XMMReg *d, XMMReg *s)
{
    XMMReg r;
    r.XMM_S(0) = float32_add(d->XMM_S(0), d->XMM_S(1), &env->sse_status);
    r.XMM_S(1) = float32_add(d->XMM_S(2), d->XMM_S(3), &env->sse_status);
    r.XMM_S(2) = float32_add(s->XMM_S(0), s->XMM_S(1), &env->sse_status);
    r.XMM_S(3) = float32_add(s->XMM_S(2), s->XMM_S(3), &env->sse_status);
    *d = r;
}

/* Pairwise sums: lane 0 of the result comes from *d, lane 1 from *s. */
void helper_haddpd(XMMReg *d, XMMReg *s)
{
    XMMReg r;
    r.XMM_D(0) = float64_add(d->XMM_D(0), d->XMM_D(1), &env->sse_status);
    r.XMM_D(1) = float64_add(s->XMM_D(0), s->XMM_D(1), &env->sse_status);
    *d = r;
}

/* Pairwise differences (even lane minus odd lane), laid out like haddps. */
void helper_hsubps(XMMReg *d, XMMReg *s)
{
    XMMReg r;
    r.XMM_S(0) = float32_sub(d->XMM_S(0), d->XMM_S(1), &env->sse_status);
    r.XMM_S(1) = float32_sub(d->XMM_S(2), d->XMM_S(3), &env->sse_status);
    r.XMM_S(2) = float32_sub(s->XMM_S(0), s->XMM_S(1), &env->sse_status);
    r.XMM_S(3) = float32_sub(s->XMM_S(2), s->XMM_S(3), &env->sse_status);
    *d = r;
}

/* Pairwise differences (lane 0 minus lane 1), laid out like haddpd. */
void helper_hsubpd(XMMReg *d, XMMReg *s)
{
    XMMReg r;
    r.XMM_D(0) = float64_sub(d->XMM_D(0), d->XMM_D(1), &env->sse_status);
    r.XMM_D(1) = float64_sub(s->XMM_D(0), s->XMM_D(1), &env->sse_status);
    *d = r;
}

/* Even lanes: d - s.  Odd lanes: d + s. */
void helper_addsubps(XMMReg *d, XMMReg *s)
{
    d->XMM_S(0) = float32_sub(d->XMM_S(0), s->XMM_S(0), &env->sse_status);
    d->XMM_S(1) = float32_add(d->XMM_S(1), s->XMM_S(1), &env->sse_status);
    d->XMM_S(2) = float32_sub(d->XMM_S(2), s->XMM_S(2), &env->sse_status);
    d->XMM_S(3) = float32_add(d->XMM_S(3), s->XMM_S(3), &env->sse_status);
}

/* Lane 0: d - s.  Lane 1: d + s. */
void helper_addsubpd(XMMReg *d, XMMReg *s)
{
    d->XMM_D(0) = float64_sub(d->XMM_D(0), s->XMM_D(0), &env->sse_status);
    d->XMM_D(1) = float64_add(d->XMM_D(1), s->XMM_D(1), &env->sse_status);
}

/* XXX: unordered */
/*
 * Generate the ps/ss/pd/sd variants of one floating-point comparison.
 * F is invoked as F(size, dest_lane, src_lane) and its integer result is
 * stored over the destination lane: through XMM_L for the 32-bit variants
 * and through XMM_Q for the 64-bit ones.
 */
#define SSE_HELPER_CMP(name, F)\
void helper_ ## name ## ps (Reg *d, Reg *s)\
{\
    d->XMM_L(0) = F(32, d->XMM_S(0), s->XMM_S(0));\
    d->XMM_L(1) = F(32, d->XMM_S(1), s->XMM_S(1));\
    d->XMM_L(2) = F(32, d->XMM_S(2), s->XMM_S(2));\
    d->XMM_L(3) = F(32, d->XMM_S(3), s->XMM_S(3));\
}\
\
void helper_ ## name ## ss (Reg *d, Reg *s)\
{\
    d->XMM_L(0) = F(32, d->XMM_S(0), s->XMM_S(0));\
}\
void helper_ ## name ## pd (Reg *d, Reg *s)\
{\
    d->XMM_Q(0) = F(64, d->XMM_D(0), s->XMM_D(0));\
    d->XMM_Q(1) = F(64, d->XMM_D(1), s->XMM_D(1));\
}\
\
void helper_ ## name ## sd (Reg *d, Reg *s)\
{\
    d->XMM_Q(0) = F(64, d->XMM_D(0), s->XMM_D(0));\
}

#define FPU_CMPEQ(size, a, b) float ## size ## _eq(a, b, &env->sse_status) ? -1 : 0
891
#define FPU_CMPLT(size, a, b) float ## size ## _lt(a, b, &env->sse_status) ? -1 : 0
892
#define FPU_CMPLE(size, a, b) float ## size ## _le(a, b, &env->sse_status) ? -1 : 0
893
#define FPU_CMPUNORD(size, a, b) float ## size ## _unordered(a, b, &env->sse_status) ? - 1 : 0
894
#define FPU_CMPNEQ(size, a, b) float ## size ## _eq(a, b, &env->sse_status) ? 0 : -1
895
#define FPU_CMPNLT(size, a, b) float ## size ## _lt(a, b, &env->sse_status) ? 0 : -1
896
#define FPU_CMPNLE(size, a, b) float ## size ## _le(a, b, &env->sse_status) ? 0 : -1
897
#define FPU_CMPORD(size, a, b) float ## size ## _unordered(a, b, &env->sse_status) ? 0 : -1
898

    
899
SSE_HELPER_CMP(cmpeq, FPU_CMPEQ)
900
SSE_HELPER_CMP(cmplt, FPU_CMPLT)
901
SSE_HELPER_CMP(cmple, FPU_CMPLE)
902
SSE_HELPER_CMP(cmpunord, FPU_CMPUNORD)
903
SSE_HELPER_CMP(cmpneq, FPU_CMPNEQ)
904
SSE_HELPER_CMP(cmpnlt, FPU_CMPNLT)
905
SSE_HELPER_CMP(cmpnle, FPU_CMPNLE)
906
SSE_HELPER_CMP(cmpord, FPU_CMPORD)
907

    
908
const int comis_eflags[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
909

    
910
void helper_ucomiss(Reg *d, Reg *s)
911
{
912
    int ret;
913
    float32 s0, s1;
914

    
915
    s0 = d->XMM_S(0);
916
    s1 = s->XMM_S(0);
917
    ret = float32_compare_quiet(s0, s1, &env->sse_status);
918
    CC_SRC = comis_eflags[ret + 1];
919
    FORCE_RET();
920
}
921

    
922
void helper_comiss(Reg *d, Reg *s)
923
{
924
    int ret;
925
    float32 s0, s1;
926

    
927
    s0 = d->XMM_S(0);
928
    s1 = s->XMM_S(0);
929
    ret = float32_compare(s0, s1, &env->sse_status);
930
    CC_SRC = comis_eflags[ret + 1];
931
    FORCE_RET();
932
}
933

    
934
void helper_ucomisd(Reg *d, Reg *s)
935
{
936
    int ret;
937
    float64 d0, d1;
938

    
939
    d0 = d->XMM_D(0);
940
    d1 = s->XMM_D(0);
941
    ret = float64_compare_quiet(d0, d1, &env->sse_status);
942
    CC_SRC = comis_eflags[ret + 1];
943
    FORCE_RET();
944
}
945

    
946
void helper_comisd(Reg *d, Reg *s)
947
{
948
    int ret;
949
    float64 d0, d1;
950

    
951
    d0 = d->XMM_D(0);
952
    d1 = s->XMM_D(0);
953
    ret = float64_compare(d0, d1, &env->sse_status);
954
    CC_SRC = comis_eflags[ret + 1];
955
    FORCE_RET();
956
}
957

    
958
/*
 * Return a 4-bit mask: bit i is bit 31 of 32-bit lane i of *s.
 */
uint32_t helper_movmskps(Reg *s)
{
    int b0, b1, b2, b3;
    b0 = s->XMM_L(0) >> 31;
    b1 = s->XMM_L(1) >> 31;
    b2 = s->XMM_L(2) >> 31;
    b3 = s->XMM_L(3) >> 31;
    return b0 | (b1 << 1) | (b2 << 2) | (b3 << 3);
}

/*
 * Return a 2-bit mask: bit 0 is bit 31 of 32-bit lane 1 and bit 1 is bit 31
 * of 32-bit lane 3, i.e. the top bit of each 64-bit half of *s.
 */
uint32_t helper_movmskpd(Reg *s)
{
    int b0, b1;
    b0 = s->XMM_L(1) >> 31;
    b1 = s->XMM_L(3) >> 31;
    return b0 | (b1 << 1);
}

#endif

/*
 * Collect bit 7 of every byte lane of *s into a mask: bit i of the result is
 * the top bit of byte i.  8 bytes are scanned when SHIFT == 0, 16 when
 * SHIFT == 1.
 */
uint32_t glue(helper_pmovmskb, SUFFIX)(Reg *s)
{
    uint32_t val;
    int i;

    val = 0;
    /* Use the width-generic B() accessor: this function is also built for
       MMXReg, where the XMM_B() accessor does not describe the register. */
    for(i = 0; i < (8 << SHIFT); i++)
        val |= (uint32_t)((s->B(i) >> 7) & 1) << i;
    return val;
}

/*
 * Pack 16-bit lanes into bytes with satsb() clamping.  The lanes of *d fill
 * the low half of the result and the lanes of *s the high half.
 */
void glue(helper_packsswb, SUFFIX) (Reg *d, Reg *s)
{
    Reg r;
    int i;

    /* Built in a temporary: *d is both an input and the output. */
    for(i = 0; i < (4 << SHIFT); i++) {
        r.B(i) = satsb((int16_t)d->W(i));
        r.B((4 << SHIFT) + i) = satsb((int16_t)s->W(i));
    }
    *d = r;
}

/*
 * Pack 16-bit lanes, read as int16_t, into bytes with satub() clamping.
 * The lanes of *d fill the low half of the result and the lanes of *s the
 * high half.
 */
void glue(helper_packuswb, SUFFIX) (Reg *d, Reg *s)
{
    Reg r;
    int i;

    /* Built in a temporary: *d is both an input and the output. */
    for(i = 0; i < (4 << SHIFT); i++) {
        r.B(i) = satub((int16_t)d->W(i));
        r.B((4 << SHIFT) + i) = satub((int16_t)s->W(i));
    }
    *d = r;
}

/*
 * Pack 32-bit lanes into 16-bit lanes with satsw() clamping.  The lanes of
 * *d fill the low half of the result and the lanes of *s the high half.
 */
void glue(helper_packssdw, SUFFIX) (Reg *d, Reg *s)
{
    Reg r;
    int i;

    /* Built in a temporary: *d is both an input and the output.  The casts
       spell out that the lanes are handed to satsw() as signed values. */
    for(i = 0; i < (2 << SHIFT); i++) {
        r.W(i) = satsw((int32_t)d->L(i));
        r.W((2 << SHIFT) + i) = satsw((int32_t)s->L(i));
    }
    *d = r;
}

1076
#define UNPCK_OP(base_name, base)                               \
1077
                                                                \
1078
void glue(helper_punpck ## base_name ## bw, SUFFIX) (Reg *d, Reg *s)   \
1079
{                                                               \
1080
    Reg r;                                              \
1081
                                                                \
1082
    r.B(0) = d->B((base << (SHIFT + 2)) + 0);                   \
1083
    r.B(1) = s->B((base << (SHIFT + 2)) + 0);                   \
1084
    r.B(2) = d->B((base << (SHIFT + 2)) + 1);                   \
1085
    r.B(3) = s->B((base << (SHIFT + 2)) + 1);                   \
1086
    r.B(4) = d->B((base << (SHIFT + 2)) + 2);                   \
1087
    r.B(5) = s->B((base << (SHIFT + 2)) + 2);                   \
1088
    r.B(6) = d->B((base << (SHIFT + 2)) + 3);                   \
1089
    r.B(7) = s->B((base << (SHIFT + 2)) + 3);                   \
1090
XMM_ONLY(                                                       \
1091
    r.B(8) = d->B((base << (SHIFT + 2)) + 4);                   \
1092
    r.B(9) = s->B((base << (SHIFT + 2)) + 4);                   \
1093
    r.B(10) = d->B((base << (SHIFT + 2)) + 5);                  \
1094
    r.B(11) = s->B((base << (SHIFT + 2)) + 5);                  \
1095
    r.B(12) = d->B((base << (SHIFT + 2)) + 6);                  \
1096
    r.B(13) = s->B((base << (SHIFT + 2)) + 6);                  \
1097
    r.B(14) = d->B((base << (SHIFT + 2)) + 7);                  \
1098
    r.B(15) = s->B((base << (SHIFT + 2)) + 7);                  \
1099
)                                                               \
1100
    *d = r;                                                     \
1101
}                                                               \
1102
                                                                \
1103
void glue(helper_punpck ## base_name ## wd, SUFFIX) (Reg *d, Reg *s)   \
1104
{                                                               \
1105
    Reg r;                                              \
1106
                                                                \
1107
    r.W(0) = d->W((base << (SHIFT + 1)) + 0);                   \
1108
    r.W(1) = s->W((base << (SHIFT + 1)) + 0);                   \
1109
    r.W(2) = d->W((base << (SHIFT + 1)) + 1);                   \
1110
    r.W(3) = s->W((base << (SHIFT + 1)) + 1);                   \
1111
XMM_ONLY(                                                       \
1112
    r.W(4) = d->W((base << (SHIFT + 1)) + 2);                   \
1113
    r.W(5) = s->W((base << (SHIFT + 1)) + 2);                   \
1114
    r.W(6) = d->W((base << (SHIFT + 1)) + 3);                   \
1115
    r.W(7) = s->W((base << (SHIFT + 1)) + 3);                   \
1116
)                                                               \
1117
    *d = r;                                                     \
1118
}                                                               \
1119
                                                                \
1120
void glue(helper_punpck ## base_name ## dq, SUFFIX) (Reg *d, Reg *s)   \
1121
{                                                               \
1122
    Reg r;                                              \
1123
                                                                \
1124
    r.L(0) = d->L((base << SHIFT) + 0);                         \
1125
    r.L(1) = s->L((base << SHIFT) + 0);                         \
1126
XMM_ONLY(                                                       \
1127
    r.L(2) = d->L((base << SHIFT) + 1);                         \
1128
    r.L(3) = s->L((base << SHIFT) + 1);                         \
1129
)                                                               \
1130
    *d = r;                                                     \
1131
}                                                               \
1132
                                                                \
1133
XMM_ONLY(                                                       \
1134
void glue(helper_punpck ## base_name ## qdq, SUFFIX) (Reg *d, Reg *s)  \
1135
{                                                               \
1136
    Reg r;                                              \
1137
                                                                \
1138
    r.Q(0) = d->Q(base);                                        \
1139
    r.Q(1) = s->Q(base);                                        \
1140
    *d = r;                                                     \
1141
}                                                               \
1142
)
1143

    
1144
UNPCK_OP(l, 0)
1145
UNPCK_OP(h, 1)
1146

    
1147
/* 3DNow! float ops */
1148
#if SHIFT == 0
1149
void helper_pi2fd(MMXReg *d, MMXReg *s)
{
    int lane;

    /* Convert both 32-bit integer lanes of s to float32, lane by lane. */
    for (lane = 0; lane < 2; lane++) {
        d->MMX_S(lane) = int32_to_float32(s->MMX_L(lane), &env->mmx_status);
    }
}
1154

    
1155
void helper_pi2fw(MMXReg *d, MMXReg *s)
{
    int lane;

    /*
     * Float lane n of d is produced from 16-bit word 2*n of s (words 0 and
     * 2), reinterpreted as int16_t before the conversion to float32.
     */
    for (lane = 0; lane < 2; lane++) {
        d->MMX_S(lane) = int32_to_float32((int16_t)s->MMX_W(2 * lane),
                                          &env->mmx_status);
    }
}
1160

    
1161
void helper_pf2id(MMXReg *d, MMXReg *s)
{
    int lane;

    /* Convert both float32 lanes of s to 32-bit integers, rounding to zero. */
    for (lane = 0; lane < 2; lane++) {
        d->MMX_L(lane) = float32_to_int32_round_to_zero(s->MMX_S(lane),
                                                        &env->mmx_status);
    }
}
1166

    
1167
void helper_pf2iw(MMXReg *d, MMXReg *s)
{
    int lane;

    /*
     * Same conversion as helper_pf2id(), but each integer result is passed
     * through satsw() before being stored into the 32-bit lane of d.
     */
    for (lane = 0; lane < 2; lane++) {
        d->MMX_L(lane) = satsw(float32_to_int32_round_to_zero(s->MMX_S(lane),
                                                              &env->mmx_status));
    }
}
1172

    
1173
void helper_pfacc(MMXReg *d, MMXReg *s)
{
    /*
     * Horizontal add: lane 0 of the result is the sum of d's two lanes,
     * lane 1 is the sum of s's two lanes.  A temporary is needed because
     * both lanes of d are inputs (and d may be the same register as s).
     */
    MMXReg acc;

    acc.MMX_S(0) = float32_add(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
    acc.MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);

    *d = acc;
}
1180

    
1181
void helper_pfadd(MMXReg *d, MMXReg *s)
{
    int lane;

    /* Lane-wise d = d + s on the two float32 lanes. */
    for (lane = 0; lane < 2; lane++) {
        d->MMX_S(lane) = float32_add(d->MMX_S(lane), s->MMX_S(lane),
                                     &env->mmx_status);
    }
}
1186

    
1187
void helper_pfcmpeq(MMXReg *d, MMXReg *s)
{
    int lane;

    /* Each lane of d becomes -1 when float32_eq(d, s) holds, otherwise 0. */
    for (lane = 0; lane < 2; lane++) {
        if (float32_eq(d->MMX_S(lane), s->MMX_S(lane), &env->mmx_status)) {
            d->MMX_L(lane) = -1;
        } else {
            d->MMX_L(lane) = 0;
        }
    }
}
1192

    
1193
void helper_pfcmpge(MMXReg *d, MMXReg *s)
{
    int lane;

    /*
     * "d >= s" is evaluated as float32_le(s, d); each lane of d becomes -1
     * when that holds and 0 otherwise.
     */
    for (lane = 0; lane < 2; lane++) {
        if (float32_le(s->MMX_S(lane), d->MMX_S(lane), &env->mmx_status)) {
            d->MMX_L(lane) = -1;
        } else {
            d->MMX_L(lane) = 0;
        }
    }
}
1198

    
1199
void helper_pfcmpgt(MMXReg *d, MMXReg *s)
{
    int lane;

    /*
     * "d > s" is evaluated as float32_lt(s, d); each lane of d becomes -1
     * when that holds and 0 otherwise.
     */
    for (lane = 0; lane < 2; lane++) {
        if (float32_lt(s->MMX_S(lane), d->MMX_S(lane), &env->mmx_status)) {
            d->MMX_L(lane) = -1;
        } else {
            d->MMX_L(lane) = 0;
        }
    }
}
1204

    
1205
void helper_pfmax(MMXReg *d, MMXReg *s)
{
    int lane;

    /* Per lane: replace d with s only when float32_lt(d, s) reports d < s. */
    for (lane = 0; lane < 2; lane++) {
        if (float32_lt(d->MMX_S(lane), s->MMX_S(lane), &env->mmx_status)) {
            d->MMX_S(lane) = s->MMX_S(lane);
        }
    }
}
1212

    
1213
void helper_pfmin(MMXReg *d, MMXReg *s)
{
    int lane;

    /* Per lane: replace d with s only when float32_lt(s, d) reports s < d. */
    for (lane = 0; lane < 2; lane++) {
        if (float32_lt(s->MMX_S(lane), d->MMX_S(lane), &env->mmx_status)) {
            d->MMX_S(lane) = s->MMX_S(lane);
        }
    }
}
1220

    
1221
void helper_pfmul(MMXReg *d, MMXReg *s)
{
    int lane;

    /* Lane-wise d = d * s on the two float32 lanes. */
    for (lane = 0; lane < 2; lane++) {
        d->MMX_S(lane) = float32_mul(d->MMX_S(lane), s->MMX_S(lane),
                                     &env->mmx_status);
    }
}
1226

    
1227
void helper_pfnacc(MMXReg *d, MMXReg *s)
{
    /*
     * Horizontal subtract: lane 0 of the result is d[0] - d[1], lane 1 is
     * s[0] - s[1].  Built in a temporary because both lanes of d are inputs
     * (and d may be the same register as s).
     */
    MMXReg diff;

    diff.MMX_S(0) = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
    diff.MMX_S(1) = float32_sub(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);

    *d = diff;
}
1234

    
1235
void helper_pfpnacc(MMXReg *d, MMXReg *s)
{
    /*
     * Mixed horizontal op: lane 0 of the result is d[0] - d[1], lane 1 is
     * s[0] + s[1].  Built in a temporary because both lanes of d are inputs
     * (and d may be the same register as s).
     */
    MMXReg mixed;

    mixed.MMX_S(0) = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
    mixed.MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);

    *d = mixed;
}
1242

    
1243
void helper_pfrcp(MMXReg *d, MMXReg *s)
{
    /*
     * Only lane 0 of s is used: approx_rcp() of it is stored in lane 0 of d
     * and then duplicated into lane 1.
     */
    d->MMX_S(1) = d->MMX_S(0) = approx_rcp(s->MMX_S(0));
}
1248

    
1249
void helper_pfrsqrt(MMXReg *d, MMXReg *s)
{
    /*
     * Only lane 0 of s is used.  Lane 1 of d serves as scratch space, so
     * lane 0 of d (which may alias the input when d == s) is written last.
     */
    const uint32_t src_bits = s->MMX_L(0);

    /* Clear bit 31 so approx_rsqrt() operates on the remaining 31 bits. */
    d->MMX_L(1) = src_bits & 0x7fffffff;
    d->MMX_S(1) = approx_rsqrt(d->MMX_S(1));

    /* Put the input's bit 31 back into the result. */
    d->MMX_L(1) |= src_bits & 0x80000000;

    /* Duplicate the result into lane 0. */
    d->MMX_L(0) = d->MMX_L(1);
}
1256

    
1257
void helper_pfsub(MMXReg *d, MMXReg *s)
{
    int lane;

    /* Lane-wise d = d - s on the two float32 lanes. */
    for (lane = 0; lane < 2; lane++) {
        d->MMX_S(lane) = float32_sub(d->MMX_S(lane), s->MMX_S(lane),
                                     &env->mmx_status);
    }
}
1262

    
1263
void helper_pfsubr(MMXReg *d, MMXReg *s)
{
    int lane;

    /* Reversed subtract: lane-wise d = s - d on the two float32 lanes. */
    for (lane = 0; lane < 2; lane++) {
        d->MMX_S(lane) = float32_sub(s->MMX_S(lane), d->MMX_S(lane),
                                     &env->mmx_status);
    }
}
1268

    
1269
void helper_pswapd(MMXReg *d, MMXReg *s)
{
    /*
     * Store the two 32-bit lanes of s into d in swapped order.  The swap
     * goes through a temporary so that it is still correct when d == s.
     */
    MMXReg swapped;

    swapped.MMX_L(1) = s->MMX_L(0);
    swapped.MMX_L(0) = s->MMX_L(1);

    *d = swapped;
}
1276
#endif
1277

    
1278
/*
 * Remove SHIFT and every width-dependent macro defined at the top of this
 * file, presumably so the file can be included again with another SHIFT
 * value.
 */
#undef SHIFT
#undef SUFFIX
#undef Reg
#undef XMM_ONLY
#undef B
#undef W
#undef L
#undef Q