Statistics
| Branch: | Revision:

root / target-arm / op_neon.h @ 6ddbc6e4

History | View | Annotate | Download (35.5 kB)

1
/*
 * ARM NEON vector operations.
 *
 * Copyright (c) 2007 CodeSourcery.
 * Written by Paul Brook
 *
 * This code is licensed under the GPL.
 */
/* Note that for NEON an "l" prefix means it is a wide operation, unlike
   scalar arm ops where it means a word size operation.  */
11

    
12
/* Mask for bit 31 of a 32-bit value.  The expansion is fully parenthesized
   so the cast cannot bind to operators at the point of use.  */
#define SIGNBIT ((uint32_t)0x80000000)
/* ??? NEON ops should probably have their own float status.  */
/* Address of the float status object kept in env->vfp; parenthesized so it
   behaves as a single operand wherever it is pasted.  */
#define NFS (&env->vfp.fp_status)
/* Expands to the signature of the micro-op function op_neon_<name>.  */
#define NEON_OP(name) void OPPROTO op_neon_##name (void)
16

    
17
NEON_OP(getreg_T0)
18
{
19
    T0 = *(uint32_t *)((char *) env + PARAM1);
20
}
21

    
22
NEON_OP(getreg_T1)
23
{
24
    T1 = *(uint32_t *)((char *) env + PARAM1);
25
}
26

    
27
NEON_OP(getreg_T2)
28
{
29
    T2 = *(uint32_t *)((char *) env + PARAM1);
30
}
31

    
32
NEON_OP(setreg_T0)
33
{
34
    *(uint32_t *)((char *) env + PARAM1) = T0;
35
}
36

    
37
NEON_OP(setreg_T1)
38
{
39
    *(uint32_t *)((char *) env + PARAM1) = T1;
40
}
41

    
42
NEON_OP(setreg_T2)
43
{
44
    *(uint32_t *)((char *) env + PARAM1) = T2;
45
}
46

    
47
/* Lane containers.  Each neon_<name> struct holds 1, 2 or 4 lanes named
   v1..v4.  When WORDS_BIGENDIAN is defined the members are declared in
   reverse order, so v1 is always the member declared at the
   least-significant end of the struct.  */
#define NEON_TYPE1(name, type) \
typedef struct { type v1; } neon_##name;
#ifdef WORDS_BIGENDIAN
#define NEON_TYPE2(name, type) \
typedef struct { type v2; type v1; } neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct { type v4; type v3; type v2; type v1; } neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct { type v1; type v2; } neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct { type v1; type v2; type v3; type v4; } neon_##name;
#endif

/* Four 8-bit lanes.  */
NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
/* Two 16-bit lanes.  */
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
/* One 32-bit lane.  */
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
/* The generator macros are only needed for the six typedefs above.  */
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1
93

    
94
/* Reinterpret the 32-bit integer VAL as the lane structure VTYPE and assign
   the result to DEST.  The conversion goes through a union, so no pointer
   casts are involved.  */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        uint32_t word; \
        vtype lanes; \
    } pun; \
    pun.word = (val); \
    dest = pun.lanes; \
    } while(0)

/* Inverse of NEON_UNPACK: reinterpret the lane structure VAL (of type VTYPE)
   as a 32-bit integer and assign it to DEST.  */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        uint32_t word; \
        vtype lanes; \
    } pun; \
    pun.lanes = (val); \
    dest = pun.word; \
    } while(0)
113

    
114
/* Element-wise drivers: apply the currently defined NEON_FN to lane k of
   vsrc1 and vsrc2, writing lane k of vdest, for 1, 2 or 4 lanes.  Each
   wider form is built from the narrower one.  */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_DO1 \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_DO2 \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

/* Define op_neon_<name>: unpack T0 and T1 into lane structs of type vtype,
   run NEON_FN on each of the n lanes, and pack the result back into T0.  */
#define NEON_VOP(name, vtype, n) \
NEON_OP(name) \
{ \
    vtype vsrc1, vsrc2, vdest; \
    NEON_UNPACK(vtype, vsrc1, T0); \
    NEON_UNPACK(vtype, vsrc2, T1); \
    NEON_DO##n; \
    NEON_PACK(vtype, T0, vdest); \
    FORCE_RET(); \
}

/* As NEON_VOP, but only T0 is unpacked.  No vsrc2 is declared here, so the
   NEON_FN in effect must not expand its third argument.  */
#define NEON_VOP1(name, vtype, n) \
NEON_OP(name) \
{ \
    vtype vsrc1, vdest; \
    NEON_UNPACK(vtype, vsrc1, T0); \
    NEON_DO##n; \
    NEON_PACK(vtype, T0, vdest); \
    FORCE_RET(); \
}

/* Pairwise operations.  */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same.  */
/* Adjacent lanes of vsrc1 feed the low result lanes, adjacent lanes of
   vsrc2 feed the high result lanes.  */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4);

/* Pairwise counterpart of NEON_VOP: same unpack/pack around NEON_PDO<n>.  */
#define NEON_POP(name, vtype, n) \
NEON_OP(name) \
{ \
    vtype vsrc1, vsrc2, vdest; \
    NEON_UNPACK(vtype, vsrc1, T0); \
    NEON_UNPACK(vtype, vsrc2, T1); \
    NEON_PDO##n; \
    NEON_PACK(vtype, T0, vdest); \
    FORCE_RET(); \
}
173

    
174
#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
175
NEON_VOP(hadd_s8, neon_s8, 4)
176
NEON_VOP(hadd_u8, neon_u8, 4)
177
NEON_VOP(hadd_s16, neon_s16, 2)
178
NEON_VOP(hadd_u16, neon_u16, 2)
179
#undef NEON_FN
180

    
181
NEON_OP(hadd_s32)
182
{
183
    int32_t src1 = T0;
184
    int32_t src2 = T1;
185
    int32_t dest;
186

    
187
    dest = (src1 >> 1) + (src2 >> 1);
188
    if (src1 & src2 & 1)
189
        dest++;
190
    T0 = dest;
191
    FORCE_RET();
192
}
193

    
194
NEON_OP(hadd_u32)
195
{
196
    uint32_t src1 = T0;
197
    uint32_t src2 = T1;
198
    uint32_t dest;
199

    
200
    dest = (src1 >> 1) + (src2 >> 1);
201
    if (src1 & src2 & 1)
202
        dest++;
203
    T0 = dest;
204
    FORCE_RET();
205
}
206

    
207
#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
208
NEON_VOP(rhadd_s8, neon_s8, 4)
209
NEON_VOP(rhadd_u8, neon_u8, 4)
210
NEON_VOP(rhadd_s16, neon_s16, 2)
211
NEON_VOP(rhadd_u16, neon_u16, 2)
212
#undef NEON_FN
213

    
214
NEON_OP(rhadd_s32)
215
{
216
    int32_t src1 = T0;
217
    int32_t src2 = T1;
218
    int32_t dest;
219

    
220
    dest = (src1 >> 1) + (src2 >> 1);
221
    if ((src1 | src2) & 1)
222
        dest++;
223
    T0 = dest;
224
    FORCE_RET();
225
}
226

    
227
NEON_OP(rhadd_u32)
228
{
229
    uint32_t src1 = T0;
230
    uint32_t src2 = T1;
231
    uint32_t dest;
232

    
233
    dest = (src1 >> 1) + (src2 >> 1);
234
    if ((src1 | src2) & 1)
235
        dest++;
236
    T0 = dest;
237
    FORCE_RET();
238
}
239

    
240
#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
241
NEON_VOP(hsub_s8, neon_s8, 4)
242
NEON_VOP(hsub_u8, neon_u8, 4)
243
NEON_VOP(hsub_s16, neon_s16, 2)
244
NEON_VOP(hsub_u16, neon_u16, 2)
245
#undef NEON_FN
246

    
247
NEON_OP(hsub_s32)
248
{
249
    int32_t src1 = T0;
250
    int32_t src2 = T1;
251
    int32_t dest;
252

    
253
    dest = (src1 >> 1) - (src2 >> 1);
254
    if ((~src1) & src2 & 1)
255
        dest--;
256
    T0 = dest;
257
    FORCE_RET();
258
}
259

    
260
NEON_OP(hsub_u32)
261
{
262
    uint32_t src1 = T0;
263
    uint32_t src2 = T1;
264
    uint32_t dest;
265

    
266
    dest = (src1 >> 1) - (src2 >> 1);
267
    if ((~src1) & src2 & 1)
268
        dest--;
269
    T0 = dest;
270
    FORCE_RET();
271
}
272

    
273
/* ??? bsl, bif and bit are all the same op, just with the operands in a
   different order.  It's currently easier to have 3 different ops than
   rearrange the operands.  */
276

    
277
/* Bitwise Select.  */
278
NEON_OP(bsl)
279
{
280
    T0 = (T0 & T2) | (T1 & ~T2);
281
}
282

    
283
/* Bitwise Insert If True.  */
284
NEON_OP(bit)
285
{
286
    T0 = (T0 & T1) | (T2 & ~T1);
287
}
288

    
289
/* Bitwise Insert If False.  */
290
NEON_OP(bif)
291
{
292
    T0 = (T2 & T1) | (T0 & ~T1);
293
}
294

    
295
#define NEON_USAT(dest, src1, src2, type) do { \
296
    uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
297
    if (tmp != (type)tmp) { \
298
        env->QF = 1; \
299
        dest = ~0; \
300
    } else { \
301
        dest = tmp; \
302
    }} while(0)
303
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
304
NEON_VOP(qadd_u8, neon_u8, 4)
305
#undef NEON_FN
306
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
307
NEON_VOP(qadd_u16, neon_u16, 2)
308
#undef NEON_FN
309
#undef NEON_USAT
310

    
311
#define NEON_SSAT(dest, src1, src2, type) do { \
312
    int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
313
    if (tmp != (type)tmp) { \
314
        env->QF = 1; \
315
        if (src2 > 0) { \
316
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
317
        } else { \
318
            tmp = 1 << (sizeof(type) * 8 - 1); \
319
        } \
320
    } \
321
    dest = tmp; \
322
    } while(0)
323
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
324
NEON_VOP(qadd_s8, neon_s8, 4)
325
#undef NEON_FN
326
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
327
NEON_VOP(qadd_s16, neon_s16, 2)
328
#undef NEON_FN
329
#undef NEON_SSAT
330

    
331
#define NEON_USAT(dest, src1, src2, type) do { \
332
    uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
333
    if (tmp != (type)tmp) { \
334
        env->QF = 1; \
335
        dest = 0; \
336
    } else { \
337
        dest = tmp; \
338
    }} while(0)
339
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
340
NEON_VOP(qsub_u8, neon_u8, 4)
341
#undef NEON_FN
342
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
343
NEON_VOP(qsub_u16, neon_u16, 2)
344
#undef NEON_FN
345
#undef NEON_USAT
346

    
347
#define NEON_SSAT(dest, src1, src2, type) do { \
348
    int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
349
    if (tmp != (type)tmp) { \
350
        env->QF = 1; \
351
        if (src2 < 0) { \
352
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
353
        } else { \
354
            tmp = 1 << (sizeof(type) * 8 - 1); \
355
        } \
356
    } \
357
    dest = tmp; \
358
    } while(0)
359
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
360
NEON_VOP(qsub_s8, neon_s8, 4)
361
#undef NEON_FN
362
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
363
NEON_VOP(qsub_s16, neon_s16, 2)
364
#undef NEON_FN
365
#undef NEON_SSAT
366

    
367
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? ~0 : 0
368
NEON_VOP(cgt_s8, neon_s8, 4)
369
NEON_VOP(cgt_u8, neon_u8, 4)
370
NEON_VOP(cgt_s16, neon_s16, 2)
371
NEON_VOP(cgt_u16, neon_u16, 2)
372
NEON_VOP(cgt_s32, neon_s32, 1)
373
NEON_VOP(cgt_u32, neon_u32, 1)
374
#undef NEON_FN
375

    
376
#define NEON_FN(dest, src1, src2) dest = (src1 >= src2) ? ~0 : 0
377
NEON_VOP(cge_s8, neon_s8, 4)
378
NEON_VOP(cge_u8, neon_u8, 4)
379
NEON_VOP(cge_s16, neon_s16, 2)
380
NEON_VOP(cge_u16, neon_u16, 2)
381
NEON_VOP(cge_s32, neon_s32, 1)
382
NEON_VOP(cge_u32, neon_u32, 1)
383
#undef NEON_FN
384

    
385
#define NEON_FN(dest, src1, src2) do { \
386
    int8_t tmp; \
387
    tmp = (int8_t)src2; \
388
    if (tmp < 0) { \
389
        dest = src1 >> -tmp; \
390
    } else { \
391
        dest = src1 << tmp; \
392
    }} while (0)
393
NEON_VOP(shl_s8, neon_s8, 4)
394
NEON_VOP(shl_u8, neon_u8, 4)
395
NEON_VOP(shl_s16, neon_s16, 2)
396
NEON_VOP(shl_u16, neon_u16, 2)
397
NEON_VOP(shl_s32, neon_s32, 1)
398
NEON_VOP(shl_u32, neon_u32, 1)
399
#undef NEON_FN
400

    
401
NEON_OP(shl_u64)
402
{
403
    int8_t shift = T2;
404
    uint64_t val = T0 | ((uint64_t)T1 << 32);
405
    if (shift < 0) {
406
        val >>= -shift;
407
    } else {
408
        val <<= shift;
409
    }
410
    T0 = val;
411
    T1 = val >> 32;
412
    FORCE_RET();
413
}
414

    
415
NEON_OP(shl_s64)
416
{
417
    int8_t shift = T2;
418
    int64_t val = T0 | ((uint64_t)T1 << 32);
419
    if (shift < 0) {
420
        val >>= -shift;
421
    } else {
422
        val <<= shift;
423
    }
424
    T0 = val;
425
    T1 = val >> 32;
426
    FORCE_RET();
427
}
428

    
429
#define NEON_FN(dest, src1, src2) do { \
430
    int8_t tmp; \
431
    tmp = (int8_t)src1; \
432
    if (tmp < 0) { \
433
        dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \
434
    } else { \
435
        dest = src2 << tmp; \
436
    }} while (0)
437

    
438
NEON_VOP(rshl_s8, neon_s8, 4)
439
NEON_VOP(rshl_u8, neon_u8, 4)
440
NEON_VOP(rshl_s16, neon_s16, 2)
441
NEON_VOP(rshl_u16, neon_u16, 2)
442
NEON_VOP(rshl_s32, neon_s32, 1)
443
NEON_VOP(rshl_u32, neon_u32, 1)
444
#undef NEON_FN
445

    
446
NEON_OP(rshl_u64)
447
{
448
    int8_t shift = T2;
449
    uint64_t val = T0 | ((uint64_t)T1 << 32);
450
    if (shift < 0) {
451
        val = (val + ((uint64_t)1 << (-1 - shift))) >> -shift;
452
        val >>= -shift;
453
    } else {
454
        val <<= shift;
455
    }
456
    T0 = val;
457
    T1 = val >> 32;
458
    FORCE_RET();
459
}
460

    
461
NEON_OP(rshl_s64)
462
{
463
    int8_t shift = T2;
464
    int64_t val = T0 | ((uint64_t)T1 << 32);
465
    if (shift < 0) {
466
        val = (val + ((int64_t)1 << (-1 - shift))) >> -shift;
467
    } else {
468
        val <<= shift;
469
    }
470
    T0 = val;
471
    T1 = val >> 32;
472
    FORCE_RET();
473
}
474

    
475
#define NEON_FN(dest, src1, src2) do { \
476
    int8_t tmp; \
477
    tmp = (int8_t)src1; \
478
    if (tmp < 0) { \
479
        dest = src2 >> -tmp; \
480
    } else { \
481
        dest = src2 << tmp; \
482
        if ((dest >> tmp) != src2) { \
483
            env->QF = 1; \
484
            dest = ~0; \
485
        } \
486
    }} while (0)
487
NEON_VOP(qshl_s8, neon_s8, 4)
488
NEON_VOP(qshl_s16, neon_s16, 2)
489
NEON_VOP(qshl_s32, neon_s32, 1)
490
#undef NEON_FN
491

    
492
NEON_OP(qshl_s64)
493
{
494
    int8_t shift = T2;
495
    int64_t val = T0 | ((uint64_t)T1 << 32);
496
    if (shift < 0) {
497
        val >>= -shift;
498
    } else {
499
        int64_t tmp = val;
500
        val <<= shift;
501
        if ((val >> shift) != tmp) {
502
            env->QF = 1;
503
            val = (tmp >> 63) ^ 0x7fffffffffffffffULL;
504
        }
505
    }
506
    T0 = val;
507
    T1 = val >> 32;
508
    FORCE_RET();
509
}
510

    
511
#define NEON_FN(dest, src1, src2) do { \
512
    int8_t tmp; \
513
    tmp = (int8_t)src1; \
514
    if (tmp < 0) { \
515
        dest = src2 >> -tmp; \
516
    } else { \
517
        dest = src2 << tmp; \
518
        if ((dest >> tmp) != src2) { \
519
            env->QF = 1; \
520
            dest = src2 >> 31; \
521
        } \
522
    }} while (0)
523
NEON_VOP(qshl_u8, neon_u8, 4)
524
NEON_VOP(qshl_u16, neon_u16, 2)
525
NEON_VOP(qshl_u32, neon_u32, 1)
526
#undef NEON_FN
527

    
528
NEON_OP(qshl_u64)
529
{
530
    int8_t shift = T2;
531
    uint64_t val = T0 | ((uint64_t)T1 << 32);
532
    if (shift < 0) {
533
        val >>= -shift;
534
    } else {
535
        uint64_t tmp = val;
536
        val <<= shift;
537
        if ((val >> shift) != tmp) {
538
            env->QF = 1;
539
            val = ~(uint64_t)0;
540
        }
541
    }
542
    T0 = val;
543
    T1 = val >> 32;
544
    FORCE_RET();
545
}
546

    
547
#define NEON_FN(dest, src1, src2) do { \
548
    int8_t tmp; \
549
    tmp = (int8_t)src1; \
550
    if (tmp < 0) { \
551
        dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \
552
    } else { \
553
        dest = src2 << tmp; \
554
        if ((dest >> tmp) != src2) { \
555
            dest = ~0; \
556
        } \
557
    }} while (0)
558
NEON_VOP(qrshl_s8, neon_s8, 4)
559
NEON_VOP(qrshl_s16, neon_s16, 2)
560
NEON_VOP(qrshl_s32, neon_s32, 1)
561
#undef NEON_FN
562

    
563
#define NEON_FN(dest, src1, src2) do { \
564
    int8_t tmp; \
565
    tmp = (int8_t)src1; \
566
    if (tmp < 0) { \
567
        dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \
568
    } else { \
569
        dest = src2 << tmp; \
570
        if ((dest >> tmp) != src2) { \
571
            env->QF = 1; \
572
            dest = src2 >> 31; \
573
        } \
574
    }} while (0)
575
NEON_VOP(qrshl_u8, neon_u8, 4)
576
NEON_VOP(qrshl_u16, neon_u16, 2)
577
NEON_VOP(qrshl_u32, neon_u32, 1)
578
#undef NEON_FN
579

    
580
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
581
NEON_VOP(max_s8, neon_s8, 4)
582
NEON_VOP(max_u8, neon_u8, 4)
583
NEON_VOP(max_s16, neon_s16, 2)
584
NEON_VOP(max_u16, neon_u16, 2)
585
NEON_VOP(max_s32, neon_s32, 1)
586
NEON_VOP(max_u32, neon_u32, 1)
587
NEON_POP(pmax_s8, neon_s8, 4)
588
NEON_POP(pmax_u8, neon_u8, 4)
589
NEON_POP(pmax_s16, neon_s16, 2)
590
NEON_POP(pmax_u16, neon_u16, 2)
591
#undef NEON_FN
592

    
593
NEON_OP(max_f32)
594
{
595
    float32 f0 = vfp_itos(T0);
596
    float32 f1 = vfp_itos(T1);
597
    T0 = (float32_compare_quiet(f0, f1, NFS) == 1) ? T0 : T1;
598
    FORCE_RET();
599
}
600

    
601
#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
602
NEON_VOP(min_s8, neon_s8, 4)
603
NEON_VOP(min_u8, neon_u8, 4)
604
NEON_VOP(min_s16, neon_s16, 2)
605
NEON_VOP(min_u16, neon_u16, 2)
606
NEON_VOP(min_s32, neon_s32, 1)
607
NEON_VOP(min_u32, neon_u32, 1)
608
NEON_POP(pmin_s8, neon_s8, 4)
609
NEON_POP(pmin_u8, neon_u8, 4)
610
NEON_POP(pmin_s16, neon_s16, 2)
611
NEON_POP(pmin_u16, neon_u16, 2)
612
#undef NEON_FN
613

    
614
NEON_OP(min_f32)
615
{
616
    float32 f0 = vfp_itos(T0);
617
    float32 f1 = vfp_itos(T1);
618
    T0 = (float32_compare_quiet(f0, f1, NFS) == -1) ? T0 : T1;
619
    FORCE_RET();
620
}
621

    
622
#define NEON_FN(dest, src1, src2) \
623
    dest = (src1 > src2) ? (src1 - src2) : (src2 - src1)
624
NEON_VOP(abd_s8, neon_s8, 4)
625
NEON_VOP(abd_u8, neon_u8, 4)
626
NEON_VOP(abd_s16, neon_s16, 2)
627
NEON_VOP(abd_u16, neon_u16, 2)
628
NEON_VOP(abd_s32, neon_s32, 1)
629
NEON_VOP(abd_u32, neon_u32, 1)
630
#undef NEON_FN
631

    
632
NEON_OP(abd_f32)
633
{
634
    float32 f0 = vfp_itos(T0);
635
    float32 f1 = vfp_itos(T1);
636
    T0 = vfp_stoi((float32_compare_quiet(f0, f1, NFS) == 1)
637
                  ? float32_sub(f0, f1, NFS)
638
                  : float32_sub(f1, f0, NFS));
639
    FORCE_RET();
640
}
641

    
642
#define NEON_FN(dest, src1, src2) dest = src1 + src2
643
NEON_VOP(add_u8, neon_u8, 4)
644
NEON_VOP(add_u16, neon_u16, 2)
645
NEON_POP(padd_u8, neon_u8, 4)
646
NEON_POP(padd_u16, neon_u16, 2)
647
#undef NEON_FN
648

    
649
NEON_OP(add_f32)
650
{
651
    T0 = vfp_stoi(float32_add(vfp_itos(T0), vfp_itos(T1), NFS));
652
    FORCE_RET();
653
}
654

    
655
#define NEON_FN(dest, src1, src2) dest = src1 - src2
656
NEON_VOP(sub_u8, neon_u8, 4)
657
NEON_VOP(sub_u16, neon_u16, 2)
658
#undef NEON_FN
659

    
660
NEON_OP(sub_f32)
661
{
662
    T0 = vfp_stoi(float32_sub(vfp_itos(T0), vfp_itos(T1), NFS));
663
    FORCE_RET();
664
}
665

    
666
#define NEON_FN(dest, src1, src2) dest = src2 - src1
667
NEON_VOP(rsb_u8, neon_u8, 4)
668
NEON_VOP(rsb_u16, neon_u16, 2)
669
#undef NEON_FN
670

    
671
NEON_OP(rsb_f32)
672
{
673
    T0 = vfp_stoi(float32_sub(vfp_itos(T1), vfp_itos(T0), NFS));
674
    FORCE_RET();
675
}
676

    
677
#define NEON_FN(dest, src1, src2) dest = src1 * src2
678
NEON_VOP(mul_u8, neon_u8, 4)
679
NEON_VOP(mul_u16, neon_u16, 2)
680
#undef NEON_FN
681

    
682
NEON_OP(mul_f32)
683
{
684
    T0 = vfp_stoi(float32_mul(vfp_itos(T0), vfp_itos(T1), NFS));
685
    FORCE_RET();
686
}
687

    
688
NEON_OP(mul_p8)
689
{
690
    T0 = helper_neon_mul_p8(T0, T1);
691
}
692

    
693
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
694
NEON_VOP(tst_u8, neon_u8, 4)
695
NEON_VOP(tst_u16, neon_u16, 2)
696
NEON_VOP(tst_u32, neon_u32, 1)
697
#undef NEON_FN
698

    
699
#define NEON_FN(dest, src1, src2) dest = (src1 == src2) ? -1 : 0
700
NEON_VOP(ceq_u8, neon_u8, 4)
701
NEON_VOP(ceq_u16, neon_u16, 2)
702
NEON_VOP(ceq_u32, neon_u32, 1)
703
#undef NEON_FN
704

    
705
/* Saturating doubling multiply returning the high half, for 16-bit lanes.
   dest receives the top 16 bits of 2 * src1 * src2, with optional rounding
   when ROUND is non-zero.  env->QF is set whenever a step saturates.

   The doubling must be skipped once the product has been replaced by the
   saturated value, as NEON_QDMULH32 does; doubling it as well turned the
   clamped 0x7fffffff into 0xfffffffe and produced -1 instead of 0x7fff.  */
#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        /* Bits 31 and 30 differ, so doubling would overflow.  */ \
        env->QF = 1; \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        /* A smaller signed value after the add means it wrapped.  */ \
        if ((int32_t)tmp < old) { \
            env->QF = 1; \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
722
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
723
NEON_VOP(qdmulh_s16, neon_s16, 2)
724
#undef NEON_FN
725
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
726
NEON_VOP(qrdmulh_s16, neon_s16, 2)
727
#undef NEON_FN
728
#undef NEON_QDMULH16
729

    
730
#define SIGNBIT64 ((uint64_t)1 << 63)
731
#define NEON_QDMULH32(dest, src1, src2, round) do { \
732
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
733
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
734
        env->QF = 1; \
735
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
736
    } else { \
737
        tmp <<= 1; \
738
    } \
739
    if (round) { \
740
        int64_t old = tmp; \
741
        tmp += (int64_t)1 << 31; \
742
        if ((int64_t)tmp < old) { \
743
            env->QF = 1; \
744
            tmp = SIGNBIT64 - 1; \
745
        } \
746
    } \
747
    dest = tmp >> 32; \
748
    } while(0)
749
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
750
NEON_VOP(qdmulh_s32, neon_s32, 1)
751
#undef NEON_FN
752
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
753
NEON_VOP(qrdmulh_s32, neon_s32, 1)
754
#undef NEON_FN
755
#undef NEON_QDMULH32
756

    
757
NEON_OP(recps_f32)
758
{
759
    T0 = vfp_stoi(helper_recps_f32(vfp_itos(T0), vfp_itos(T1)));
760
    FORCE_RET();
761
}
762

    
763
NEON_OP(rsqrts_f32)
764
{
765
    T0 = vfp_stoi(helper_rsqrts_f32(vfp_itos(T0), vfp_itos(T1)));
766
    FORCE_RET();
767
}
768

    
769
/* Floating point comparisons produce an integer result.  */
770
#define NEON_VOP_FCMP(name, cmp) \
771
NEON_OP(name) \
772
{ \
773
    if (float32_compare_quiet(vfp_itos(T0), vfp_itos(T1), NFS) cmp 0) \
774
        T0 = -1; \
775
    else \
776
        T0 = 0; \
777
    FORCE_RET(); \
778
}
779

    
780
NEON_VOP_FCMP(ceq_f32, ==)
781
NEON_VOP_FCMP(cge_f32, >=)
782
NEON_VOP_FCMP(cgt_f32, >)
783

    
784
NEON_OP(acge_f32)
785
{
786
    float32 f0 = float32_abs(vfp_itos(T0));
787
    float32 f1 = float32_abs(vfp_itos(T1));
788
    T0 = (float32_compare_quiet(f0, f1,NFS) >= 0) ? -1 : 0;
789
    FORCE_RET();
790
}
791

    
792
NEON_OP(acgt_f32)
793
{
794
    float32 f0 = float32_abs(vfp_itos(T0));
795
    float32 f1 = float32_abs(vfp_itos(T1));
796
    T0 = (float32_compare_quiet(f0, f1, NFS) > 0) ? -1 : 0;
797
    FORCE_RET();
798
}
799

    
800
/* Narrowing instructions.  The named type is the destination type.  */
801
NEON_OP(narrow_u8)
802
{
803
    T0 = (T0 & 0xff) | ((T0 >> 8) & 0xff00)
804
         | ((T1 << 16) & 0xff0000) | (T1 << 24);
805
    FORCE_RET();
806
}
807

    
808
NEON_OP(narrow_sat_u8)
809
{
810
    neon_u16 src;
811
    neon_u8 dest;
812
#define SAT8(d, s) \
813
    if (s > 0xff) { \
814
        d = 0xff; \
815
        env->QF = 1; \
816
    } else  { \
817
        d = s; \
818
    }
819

    
820
    NEON_UNPACK(neon_u16, src, T0);
821
    SAT8(dest.v1, src.v1);
822
    SAT8(dest.v2, src.v2);
823
    NEON_UNPACK(neon_u16, src, T1);
824
    SAT8(dest.v3, src.v1);
825
    SAT8(dest.v4, src.v2);
826
    NEON_PACK(neon_u8, T0, dest);
827
    FORCE_RET();
828
#undef SAT8
829
}
830

    
831
NEON_OP(narrow_sat_s8)
832
{
833
    neon_s16 src;
834
    neon_s8 dest;
835
#define SAT8(d, s) \
836
    if (s != (uint8_t)s) { \
837
        d = (s >> 15) ^ 0x7f; \
838
        env->QF = 1; \
839
    } else  { \
840
        d = s; \
841
    }
842

    
843
    NEON_UNPACK(neon_s16, src, T0);
844
    SAT8(dest.v1, src.v1);
845
    SAT8(dest.v2, src.v2);
846
    NEON_UNPACK(neon_s16, src, T1);
847
    SAT8(dest.v3, src.v1);
848
    SAT8(dest.v4, src.v2);
849
    NEON_PACK(neon_s8, T0, dest);
850
    FORCE_RET();
851
#undef SAT8
852
}
853

    
854
NEON_OP(narrow_u16)
855
{
856
    T0 = (T0 & 0xffff) | (T1 << 16);
857
}
858

    
859
NEON_OP(narrow_sat_u16)
860
{
861
    if (T0 > 0xffff) {
862
        T0 = 0xffff;
863
        env->QF = 1;
864
    }
865
    if (T1 > 0xffff) {
866
        T1 = 0xffff;
867
        env->QF = 1;
868
    }
869
    T0 |= T1 << 16;
870
    FORCE_RET();
871
}
872

    
873
NEON_OP(narrow_sat_s16)
874
{
875
    if ((int32_t)T0 != (int16_t)T0) {
876
        T0 = ((int32_t)T0 >> 31) ^ 0x7fff;
877
        env->QF = 1;
878
    }
879
    if ((int32_t)T1 != (int16_t) T1) {
880
        T1 = ((int32_t)T1 >> 31) ^ 0x7fff;
881
        env->QF = 1;
882
    }
883
    T0 = (uint16_t)T0 | (T1 << 16);
884
    FORCE_RET();
885
}
886

    
887
NEON_OP(narrow_sat_u32)
888
{
889
    if (T1) {
890
        T0 = 0xffffffffu;
891
        env->QF = 1;
892
    }
893
    FORCE_RET();
894
}
895

    
896
NEON_OP(narrow_sat_s32)
897
{
898
    int32_t sign = (int32_t)T1 >> 31;
899

    
900
    if ((int32_t)T1 != sign) {
901
        T0 = sign ^ 0x7fffffff;
902
        env->QF = 1;
903
    }
904
    FORCE_RET();
905
}
906

    
907
/* Narrowing instructions.  Named type is the narrow type.  */
908
NEON_OP(narrow_high_u8)
909
{
910
    T0 = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00)
911
        | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000);
912
    FORCE_RET();
913
}
914

    
915
NEON_OP(narrow_high_u16)
916
{
917
    T0 = (T0 >> 16) | (T1 & 0xffff0000);
918
    FORCE_RET();
919
}
920

    
921
NEON_OP(narrow_high_round_u8)
922
{
923
    T0 = (((T0 + 0x80) >> 8) & 0xff) | (((T0 + 0x800000) >> 16) & 0xff00)
924
        | (((T1 + 0x80) << 8) & 0xff0000) | ((T1 + 0x800000) & 0xff000000);
925
    FORCE_RET();
926
}
927

    
928
NEON_OP(narrow_high_round_u16)
929
{
930
    T0 = ((T0 + 0x8000) >> 16) | ((T1 + 0x8000) & 0xffff0000);
931
    FORCE_RET();
932
}
933

    
934
NEON_OP(narrow_high_round_u32)
935
{
936
    if (T0 >= 0x80000000u)
937
        T0 = T1 + 1;
938
    else
939
        T0 = T1;
940
    FORCE_RET();
941
}
942

    
943
/* Widening instructions.  Named type is source type.  */
944
NEON_OP(widen_s8)
945
{
946
    uint32_t src;
947

    
948
    src = T0;
949
    T0 = (uint16_t)(int8_t)src | ((int8_t)(src >> 8) << 16);
950
    T1 = (uint16_t)(int8_t)(src >> 16) | ((int8_t)(src >> 24) << 16);
951
}
952

    
953
NEON_OP(widen_u8)
954
{
955
    T1 = ((T0 >> 8) & 0xff0000) | ((T0 >> 16) & 0xff);
956
    T0 = ((T0 << 8) & 0xff0000) | (T0 & 0xff);
957
}
958

    
959
NEON_OP(widen_s16)
960
{
961
    int32_t src;
962

    
963
    src = T0;
964
    T0 = (int16_t)src;
965
    T1 = src >> 16;
966
}
967

    
968
NEON_OP(widen_u16)
969
{
970
    T1 = T0 >> 16;
971
    T0 &= 0xffff;
972
}
973

    
974
NEON_OP(widen_s32)
975
{
976
    T1 = (int32_t)T0 >> 31;
977
    FORCE_RET();
978
}
979

    
980
NEON_OP(widen_high_u8)
981
{
982
    T1 = (T0 & 0xff000000) | ((T0 >> 8) & 0xff00);
983
    T0 = ((T0 << 16) & 0xff000000) | ((T0 << 8) & 0xff00);
984
}
985

    
986
NEON_OP(widen_high_u16)
987
{
988
    T1 = T0 & 0xffff0000;
989
    T0 <<= 16;
990
}
991

    
992
/* Long operations.  The type is the wide type.  */
993
NEON_OP(shll_u16)
994
{
995
    int shift = PARAM1;
996
    uint32_t mask;
997

    
998
    mask = 0xffff >> (16 - shift);
999
    mask |= mask << 16;
1000
    mask = ~mask;
1001

    
1002
    T0 = (T0 << shift) & mask;
1003
    T1 = (T1 << shift) & mask;
1004
    FORCE_RET();
1005
}
1006

    
1007
NEON_OP(shll_u64)
1008
{
1009
    int shift = PARAM1;
1010

    
1011
    T1 <<= shift;
1012
    T1 |= T0 >> (32 - shift);
1013
    T0 <<= shift;
1014
    FORCE_RET();
1015
}
1016

    
1017
NEON_OP(addl_u16)
1018
{
1019
    uint32_t tmp;
1020
    uint32_t high;
1021

    
1022
    tmp = env->vfp.scratch[0];
1023
    high = (T0 >> 16) + (tmp >> 16);
1024
    T0 = (uint16_t)(T0 + tmp);
1025
    T0 |= (high << 16);
1026
    tmp = env->vfp.scratch[1];
1027
    high = (T1 >> 16) + (tmp >> 16);
1028
    T1 = (uint16_t)(T1 + tmp);
1029
    T1 |= (high << 16);
1030
    FORCE_RET();
1031
}
1032

    
1033
NEON_OP(addl_u32)
1034
{
1035
    T0 += env->vfp.scratch[0];
1036
    T1 += env->vfp.scratch[1];
1037
    FORCE_RET();
1038
}
1039

    
1040
NEON_OP(addl_u64)
1041
{
1042
    uint64_t tmp;
1043
    tmp = T0 | ((uint64_t)T1 << 32);
1044
    tmp += env->vfp.scratch[0];
1045
    tmp += (uint64_t)env->vfp.scratch[1] << 32;
1046
    T0 = tmp;
1047
    T1 = tmp >> 32;
1048
    FORCE_RET();
1049
}
1050

    
1051
NEON_OP(subl_u16)
1052
{
1053
    uint32_t tmp;
1054
    uint32_t high;
1055

    
1056
    tmp = env->vfp.scratch[0];
1057
    high = (T0 >> 16) - (tmp >> 16);
1058
    T0 = (uint16_t)(T0 - tmp);
1059
    T0 |= (high << 16);
1060
    tmp = env->vfp.scratch[1];
1061
    high = (T1 >> 16) - (tmp >> 16);
1062
    T1 = (uint16_t)(T1 - tmp);
1063
    T1 |= (high << 16);
1064
    FORCE_RET();
1065
}
1066

    
1067
NEON_OP(subl_u32)
1068
{
1069
    T0 -= env->vfp.scratch[0];
1070
    T1 -= env->vfp.scratch[1];
1071
    FORCE_RET();
1072
}
1073

    
1074
NEON_OP(subl_u64)
1075
{
1076
    uint64_t tmp;
1077
    tmp = T0 | ((uint64_t)T1 << 32);
1078
    tmp -= env->vfp.scratch[0];
1079
    tmp -= (uint64_t)env->vfp.scratch[1] << 32;
1080
    T0 = tmp;
1081
    T1 = tmp >> 32;
1082
    FORCE_RET();
1083
}
1084

    
1085
#define DO_ABD(dest, x, y, type) do { \
1086
    type tmp_x = x; \
1087
    type tmp_y = y; \
1088
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
1089
    } while(0)
1090

    
1091
NEON_OP(abdl_u16)
1092
{
1093
    uint32_t tmp;
1094
    uint32_t low;
1095
    uint32_t high;
1096

    
1097
    DO_ABD(low, T0, T1, uint8_t);
1098
    DO_ABD(tmp, T0 >> 8, T1 >> 8, uint8_t);
1099
    low |= tmp << 16;
1100
    DO_ABD(high, T0 >> 16, T1 >> 16, uint8_t);
1101
    DO_ABD(tmp, T0 >> 24, T1 >> 24, uint8_t);
1102
    high |= tmp << 16;
1103
    T0 = low;
1104
    T1 = high;
1105
    FORCE_RET();
1106
}
1107

    
1108
NEON_OP(abdl_s16)
1109
{
1110
    uint32_t tmp;
1111
    uint32_t low;
1112
    uint32_t high;
1113

    
1114
    DO_ABD(low, T0, T1, int8_t);
1115
    DO_ABD(tmp, T0 >> 8, T1 >> 8, int8_t);
1116
    low |= tmp << 16;
1117
    DO_ABD(high, T0 >> 16, T1 >> 16, int8_t);
1118
    DO_ABD(tmp, T0 >> 24, T1 >> 24, int8_t);
1119
    high |= tmp << 16;
1120
    T0 = low;
1121
    T1 = high;
1122
    FORCE_RET();
1123
}
1124

    
1125
NEON_OP(abdl_u32)
1126
{
1127
    uint32_t low;
1128
    uint32_t high;
1129

    
1130
    DO_ABD(low, T0, T1, uint16_t);
1131
    DO_ABD(high, T0 >> 16, T1 >> 16, uint16_t);
1132
    T0 = low;
1133
    T1 = high;
1134
    FORCE_RET();
1135
}
1136

    
1137
NEON_OP(abdl_s32)
1138
{
1139
    uint32_t low;
1140
    uint32_t high;
1141

    
1142
    DO_ABD(low, T0, T1, int16_t);
1143
    DO_ABD(high, T0 >> 16, T1 >> 16, int16_t);
1144
    T0 = low;
1145
    T1 = high;
1146
    FORCE_RET();
1147
}
1148

    
1149
NEON_OP(abdl_u64)
1150
{
1151
    DO_ABD(T0, T0, T1, uint32_t);
1152
    T1 = 0;
1153
}
1154

    
1155
NEON_OP(abdl_s64)
1156
{
1157
    DO_ABD(T0, T0, T1, int32_t);
1158
    T1 = 0;
1159
}
1160
#undef DO_ABD
1161

    
1162
/* Widening multiply.  Named type is the source type.
   x and y are converted to type1 (the element type), widened to type2
   and multiplied; the product truncated to type2 is stored in dest.
   Adding 0u forces the multiplication to be carried out in an unsigned
   type at least as wide as unsigned int.  Without it a uint16_t type2
   promotes to int, and e.g. 0xffff * 0xffff (-1 * -1 for int8_t sources)
   overflows.  */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)(((type2)tmp_x + 0u) * ((type2)tmp_y + 0u)); \
    } while(0)
1168

    
1169
NEON_OP(mull_u8)
1170
{
1171
    uint32_t tmp;
1172
    uint32_t low;
1173
    uint32_t high;
1174

    
1175
    DO_MULL(low, T0, T1, uint8_t, uint16_t);
1176
    DO_MULL(tmp, T0 >> 8, T1 >> 8, uint8_t, uint16_t);
1177
    low |= tmp << 16;
1178
    DO_MULL(high, T0 >> 16, T1 >> 16, uint8_t, uint16_t);
1179
    DO_MULL(tmp, T0 >> 24, T1 >> 24, uint8_t, uint16_t);
1180
    high |= tmp << 16;
1181
    T0 = low;
1182
    T1 = high;
1183
    FORCE_RET();
1184
}
1185

    
1186
NEON_OP(mull_s8)
1187
{
1188
    uint32_t tmp;
1189
    uint32_t low;
1190
    uint32_t high;
1191

    
1192
    DO_MULL(low, T0, T1, int8_t, uint16_t);
1193
    DO_MULL(tmp, T0 >> 8, T1 >> 8, int8_t, uint16_t);
1194
    low |= tmp << 16;
1195
    DO_MULL(high, T0 >> 16, T1 >> 16, int8_t, uint16_t);
1196
    DO_MULL(tmp, T0 >> 24, T1 >> 24, int8_t, uint16_t);
1197
    high |= tmp << 16;
1198
    T0 = low;
1199
    T1 = high;
1200
    FORCE_RET();
1201
}
1202

    
1203
NEON_OP(mull_u16)
1204
{
1205
    uint32_t low;
1206
    uint32_t high;
1207

    
1208
    DO_MULL(low, T0, T1, uint16_t, uint32_t);
1209
    DO_MULL(high, T0 >> 16, T1 >> 16, uint16_t, uint32_t);
1210
    T0 = low;
1211
    T1 = high;
1212
    FORCE_RET();
1213
}
1214

    
1215
NEON_OP(mull_s16)
1216
{
1217
    uint32_t low;
1218
    uint32_t high;
1219

    
1220
    DO_MULL(low, T0, T1, int16_t, uint32_t);
1221
    DO_MULL(high, T0 >> 16, T1 >> 16, int16_t, uint32_t);
1222
    T0 = low;
1223
    T1 = high;
1224
    FORCE_RET();
1225
}
1226

    
1227
NEON_OP(addl_saturate_s32)
1228
{
1229
    uint32_t tmp;
1230
    uint32_t res;
1231

    
1232
    tmp = env->vfp.scratch[0];
1233
    res = T0 + tmp;
1234
    if (((res ^ T0) & SIGNBIT) && !((T0 ^ tmp) & SIGNBIT)) {
1235
        env->QF = 1;
1236
        T0 = (T0 >> 31) ^ 0x7fffffff;
1237
    } else {
1238
      T0 = res;
1239
    }
1240
    tmp = env->vfp.scratch[1];
1241
    res = T1 + tmp;
1242
    if (((res ^ T1) & SIGNBIT) && !((T1 ^ tmp) & SIGNBIT)) {
1243
        env->QF = 1;
1244
        T1 = (T1 >> 31) ^ 0x7fffffff;
1245
    } else {
1246
      T1 = res;
1247
    }
1248
    FORCE_RET();
1249
}
1250

    
1251
NEON_OP(addl_saturate_s64)
1252
{
1253
    uint64_t src1;
1254
    uint64_t src2;
1255
    uint64_t res;
1256

    
1257
    src1 = T0 + ((uint64_t)T1 << 32);
1258
    src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);
1259
    res = src1 + src2;
1260
    if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {
1261
        env->QF = 1;
1262
        T0 = ~(int64_t)src1 >> 63;
1263
        T1 = T0 ^ 0x80000000;
1264
    } else {
1265
      T0 = res;
1266
      T1 = res >> 32;
1267
    }
1268
    FORCE_RET();
1269
}
1270

    
1271
NEON_OP(addl_saturate_u64)
1272
{
1273
    uint64_t src1;
1274
    uint64_t src2;
1275
    uint64_t res;
1276

    
1277
    src1 = T0 + ((uint64_t)T1 << 32);
1278
    src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);
1279
    res = src1 + src2;
1280
    if (res < src1) {
1281
        env->QF = 1;
1282
        T0 = 0xffffffff;
1283
        T1 = 0xffffffff;
1284
    } else {
1285
      T0 = res;
1286
      T1 = res >> 32;
1287
    }
1288
    FORCE_RET();
1289
}
1290

    
1291
NEON_OP(subl_saturate_s64)
1292
{
1293
    uint64_t src1;
1294
    uint64_t src2;
1295
    uint64_t res;
1296

    
1297
    src1 = T0 + ((uint64_t)T1 << 32);
1298
    src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);
1299
    res = src1 - src2;
1300
    if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {
1301
        env->QF = 1;
1302
        T0 = ~(int64_t)src1 >> 63;
1303
        T1 = T0 ^ 0x80000000;
1304
    } else {
1305
      T0 = res;
1306
      T1 = res >> 32;
1307
    }
1308
    FORCE_RET();
1309
}
1310

    
1311
NEON_OP(subl_saturate_u64)
1312
{
1313
    uint64_t src1;
1314
    uint64_t src2;
1315
    uint64_t res;
1316

    
1317
    src1 = T0 + ((uint64_t)T1 << 32);
1318
    src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);
1319
    if (src1 < src2) {
1320
        env->QF = 1;
1321
        T0 = 0;
1322
        T1 = 0;
1323
    } else {
1324
      res = src1 - src2;
1325
      T0 = res;
1326
      T1 = res >> 32;
1327
    }
1328
    FORCE_RET();
1329
}
1330

    
1331
NEON_OP(negl_u16)
1332
{
1333
    uint32_t tmp;
1334
    tmp = T0 >> 16;
1335
    tmp = -tmp;
1336
    T0 = (-T0 & 0xffff) | (tmp << 16);
1337
    tmp = T1 >> 16;
1338
    tmp = -tmp;
1339
    T1 = (-T1 & 0xffff) | (tmp << 16);
1340
    FORCE_RET();
1341
}
1342

    
1343
NEON_OP(negl_u32)
1344
{
1345
    T0 = -T0;
1346
    T1 = -T1;
1347
    FORCE_RET();
1348
}
1349

    
1350
NEON_OP(negl_u64)
1351
{
1352
    uint64_t val;
1353

    
1354
    val = T0 | ((uint64_t)T1 << 32);
1355
    val = -val;
1356
    T0 = val;
1357
    T1 = val >> 32;
1358
    FORCE_RET();
1359
}
1360

    
1361
/* Scalar operations.  */
1362
NEON_OP(dup_low16)
1363
{
1364
    T0 = (T0 & 0xffff) | (T0 << 16);
1365
    FORCE_RET();
1366
}
1367

    
1368
NEON_OP(dup_high16)
1369
{
1370
    T0 = (T0 >> 16) | (T0 & 0xffff0000);
1371
    FORCE_RET();
1372
}
1373

    
1374
/* Helper for VEXT */
1375
NEON_OP(extract)
1376
{
1377
    int shift = PARAM1;
1378
    T0 = (T0 >> shift) | (T1 << (32 - shift));
1379
    FORCE_RET();
1380
}
1381

    
1382
/* Pairwise add long.  Named type is source type.  */
1383
NEON_OP(paddl_s8)
1384
{
1385
    int8_t src1;
1386
    int8_t src2;
1387
    uint16_t result;
1388
    src1 = T0 >> 24;
1389
    src2 = T0 >> 16;
1390
    result = (uint16_t)src1 + src2;
1391
    src1 = T0 >> 8;
1392
    src2 = T0;
1393
    T0 = (uint16_t)((uint16_t)src1 + src2) | ((uint32_t)result << 16);
1394
    FORCE_RET();
1395
}
1396

    
1397
NEON_OP(paddl_u8)
1398
{
1399
    uint8_t src1;
1400
    uint8_t src2;
1401
    uint16_t result;
1402
    src1 = T0 >> 24;
1403
    src2 = T0 >> 16;
1404
    result = (uint16_t)src1 + src2;
1405
    src1 = T0 >> 8;
1406
    src2 = T0;
1407
    T0 = (uint16_t)((uint16_t)src1 + src2) | ((uint32_t)result << 16);
1408
    FORCE_RET();
1409
}
1410

    
1411
NEON_OP(paddl_s16)
1412
{
1413
    T0 = (uint32_t)(int16_t)T0 + (uint32_t)(int16_t)(T0 >> 16);
1414
    FORCE_RET();
1415
}
1416

    
1417
NEON_OP(paddl_u16)
1418
{
1419
    T0 = (uint32_t)(uint16_t)T0 + (uint32_t)(uint16_t)(T0 >> 16);
1420
    FORCE_RET();
1421
}
1422

    
1423
NEON_OP(paddl_s32)
1424
{
1425
    int64_t tmp;
1426
    tmp = (int64_t)(int32_t)T0 + (int64_t)(int32_t)T1;
1427
    T0 = tmp;
1428
    T1 = tmp >> 32;
1429
    FORCE_RET();
1430
}
1431

    
1432
NEON_OP(paddl_u32)
1433
{
1434
    uint64_t tmp;
1435
    tmp = (uint64_t)T0 + (uint64_t)T1;
1436
    T0 = tmp;
1437
    T1 = tmp >> 32;
1438
    FORCE_RET();
1439
}
1440

    
1441
/* Count Leading Sign/Zero Bits.  */
1442
/* Return the number of leading zero bits in the 8-bit value x
   (8 when x is zero).  */
static inline int do_clz8(uint8_t x)
{
    int zeros = 8;

    while (x != 0) {
        x >>= 1;
        zeros--;
    }
    return zeros;
}
1449

    
1450
/* Return the number of leading zero bits in the 16-bit value x
   (16 when x is zero).  */
static inline int do_clz16(uint16_t x)
{
    int zeros = 16;

    while (x != 0) {
        x >>= 1;
        zeros--;
    }
    return zeros;
}
1457

    
1458
NEON_OP(clz_u8)
1459
{
1460
    uint32_t result;
1461
    uint32_t tmp;
1462

    
1463
    tmp = T0;
1464
    result = do_clz8(tmp);
1465
    result |= do_clz8(tmp >> 8) << 8;
1466
    result |= do_clz8(tmp >> 16) << 16;
1467
    result |= do_clz8(tmp >> 24) << 24;
1468
    T0 = result;
1469
    FORCE_RET();
1470
}
1471

    
1472
NEON_OP(clz_u16)
1473
{
1474
    uint32_t result;
1475
    uint32_t tmp;
1476
    tmp = T0;
1477
    result = do_clz16(tmp);
1478
    result |= do_clz16(tmp >> 16) << 16;
1479
    T0 = result;
1480
    FORCE_RET();
1481
}
1482

    
1483
NEON_OP(cls_s8)
1484
{
1485
    uint32_t result;
1486
    int8_t tmp;
1487
    tmp = T0;
1488
    result = do_clz8((tmp < 0) ? ~tmp : tmp) - 1;
1489
    tmp = T0 >> 8;
1490
    result |= (do_clz8((tmp < 0) ? ~tmp : tmp) - 1) << 8;
1491
    tmp = T0 >> 16;
1492
    result |= (do_clz8((tmp < 0) ? ~tmp : tmp) - 1) << 16;
1493
    tmp = T0 >> 24;
1494
    result |= (do_clz8((tmp < 0) ? ~tmp : tmp) - 1) << 24;
1495
    T0 = result;
1496
    FORCE_RET();
1497
}
1498

    
1499
NEON_OP(cls_s16)
1500
{
1501
    uint32_t result;
1502
    int16_t tmp;
1503
    tmp = T0;
1504
    result = do_clz16((tmp < 0) ? ~tmp : tmp) - 1;
1505
    tmp = T0 >> 16;
1506
    result |= (do_clz16((tmp < 0) ? ~tmp : tmp) - 1) << 16;
1507
    T0 = result;
1508
    FORCE_RET();
1509
}
1510

    
1511
NEON_OP(cls_s32)
1512
{
1513
    int count;
1514
    if ((int32_t)T0 < 0)
1515
        T0 = ~T0;
1516
    for (count = 32; T0 > 0; count--)
1517
        T0 = T0 >> 1;
1518
    T0 = count - 1;
1519
    FORCE_RET();
1520
}
1521

    
1522
/* Bit count.  */
1523
NEON_OP(cnt_u8)
1524
{
1525
    T0 = (T0 & 0x55555555) + ((T0 >>  1) & 0x55555555);
1526
    T0 = (T0 & 0x33333333) + ((T0 >>  2) & 0x33333333);
1527
    T0 = (T0 & 0x0f0f0f0f) + ((T0 >>  4) & 0x0f0f0f0f);
1528
    FORCE_RET();
1529
}
1530

    
1531
/* Saturating absolute value and negation.  */
1532
/* ??? Make these use NEON_VOP1 */
1533
#define DO_QABS8(x) do { \
1534
    if (x == (int8_t)0x80) { \
1535
        x = 0x7f; \
1536
        env->QF = 1; \
1537
    } else if (x < 0) { \
1538
        x = -x; \
1539
    }} while (0)
1540
NEON_OP(qabs_s8)
1541
{
1542
    neon_s8 vec;
1543
    NEON_UNPACK(neon_s8, vec, T0);
1544
    DO_QABS8(vec.v1);
1545
    DO_QABS8(vec.v2);
1546
    DO_QABS8(vec.v3);
1547
    DO_QABS8(vec.v4);
1548
    NEON_PACK(neon_s8, T0, vec);
1549
    FORCE_RET();
1550
}
1551
#undef DO_QABS8
1552

    
1553
#define DO_QNEG8(x) do { \
1554
    if (x == (int8_t)0x80) { \
1555
        x = 0x7f; \
1556
        env->QF = 1; \
1557
    } else { \
1558
        x = -x; \
1559
    }} while (0)
1560
NEON_OP(qneg_s8)
1561
{
1562
    neon_s8 vec;
1563
    NEON_UNPACK(neon_s8, vec, T0);
1564
    DO_QNEG8(vec.v1);
1565
    DO_QNEG8(vec.v2);
1566
    DO_QNEG8(vec.v3);
1567
    DO_QNEG8(vec.v4);
1568
    NEON_PACK(neon_s8, T0, vec);
1569
    FORCE_RET();
1570
}
1571
#undef DO_QNEG8
1572

    
1573
#define DO_QABS16(x) do { \
1574
    if (x == (int16_t)0x8000) { \
1575
        x = 0x7fff; \
1576
        env->QF = 1; \
1577
    } else if (x < 0) { \
1578
        x = -x; \
1579
    }} while (0)
1580
NEON_OP(qabs_s16)
1581
{
1582
    neon_s16 vec;
1583
    NEON_UNPACK(neon_s16, vec, T0);
1584
    DO_QABS16(vec.v1);
1585
    DO_QABS16(vec.v2);
1586
    NEON_PACK(neon_s16, T0, vec);
1587
    FORCE_RET();
1588
}
1589
#undef DO_QABS16
1590

    
1591
#define DO_QNEG16(x) do { \
1592
    if (x == (int16_t)0x8000) { \
1593
        x = 0x7fff; \
1594
        env->QF = 1; \
1595
    } else { \
1596
        x = -x; \
1597
    }} while (0)
1598
NEON_OP(qneg_s16)
1599
{
1600
    neon_s16 vec;
1601
    NEON_UNPACK(neon_s16, vec, T0);
1602
    DO_QNEG16(vec.v1);
1603
    DO_QNEG16(vec.v2);
1604
    NEON_PACK(neon_s16, T0, vec);
1605
    FORCE_RET();
1606
}
1607
#undef DO_QNEG16
1608

    
1609
NEON_OP(qabs_s32)
1610
{
1611
    if (T0 == 0x80000000) {
1612
        T0 = 0x7fffffff;
1613
        env->QF = 1;
1614
    } else if ((int32_t)T0 < 0) {
1615
        T0 = -T0;
1616
    }
1617
    FORCE_RET();
1618
}
1619

    
1620
NEON_OP(qneg_s32)
1621
{
1622
    if (T0 == 0x80000000) {
1623
        T0 = 0x7fffffff;
1624
        env->QF = 1;
1625
    } else {
1626
        T0 = -T0;
1627
    }
1628
    FORCE_RET();
1629
}
1630

    
1631
/* Unary operations */
1632
#define NEON_FN(dest, src, dummy) dest = (src < 0) ? -src : src
1633
NEON_VOP1(abs_s8, neon_s8, 4)
1634
NEON_VOP1(abs_s16, neon_s16, 2)
1635
NEON_OP(abs_s32)
1636
{
1637
    if ((int32_t)T0 < 0)
1638
        T0 = -T0;
1639
    FORCE_RET();
1640
}
1641
#undef NEON_FN
1642

    
1643
/* Transpose.  Argument order is rather strange to avoid special casing
1644
   the tranlation code.
1645
   On input T0 = rm, T1 = rd.  On output T0 = rd, T1 = rm  */
1646
NEON_OP(trn_u8)
1647
{
1648
    uint32_t rd;
1649
    uint32_t rm;
1650
    rd = ((T0 & 0x00ff00ff) << 8) | (T1 & 0x00ff00ff);
1651
    rm = ((T1 & 0xff00ff00) >> 8) | (T0 & 0xff00ff00);
1652
    T0 = rd;
1653
    T1 = rm;
1654
    FORCE_RET();
1655
}
1656

    
1657
NEON_OP(trn_u16)
1658
{
1659
    uint32_t rd;
1660
    uint32_t rm;
1661
    rd = (T0 << 16) | (T1 & 0xffff);
1662
    rm = (T1 >> 16) | (T0 & 0xffff0000);
1663
    T0 = rd;
1664
    T1 = rm;
1665
    FORCE_RET();
1666
}
1667

    
1668
/* Worker routines for zip and unzip.  */
1669
NEON_OP(unzip_u8)
1670
{
1671
    uint32_t rd;
1672
    uint32_t rm;
1673
    rd = (T0 & 0xff) | ((T0 >> 8) & 0xff00)
1674
         | ((T1 << 16) & 0xff0000) | ((T1 << 8) & 0xff000000);
1675
    rm = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00)
1676
         | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000);
1677
    T0 = rd;
1678
    T1 = rm;
1679
    FORCE_RET();
1680
}
1681

    
1682
NEON_OP(zip_u8)
1683
{
1684
    uint32_t rd;
1685
    uint32_t rm;
1686
    rd = (T0 & 0xff) | ((T1 << 8) & 0xff00)
1687
         | ((T0 << 16) & 0xff0000) | ((T1 << 24) & 0xff000000);
1688
    rm = ((T0 >> 16) & 0xff) | ((T1 >> 8) & 0xff00)
1689
         | ((T0 >> 8) & 0xff0000) | (T1 & 0xff000000);
1690
    T0 = rd;
1691
    T1 = rm;
1692
    FORCE_RET();
1693
}
1694

    
1695
NEON_OP(zip_u16)
1696
{
1697
    uint32_t tmp;
1698

    
1699
    tmp = (T0 & 0xffff) | (T1 << 16);
1700
    T1 = (T1 & 0xffff0000) | (T0 >> 16);
1701
    T0 = tmp;
1702
    FORCE_RET();
1703
}
1704

    
1705
/* Reciprocal/root estimate.  */
1706
NEON_OP(recpe_u32)
1707
{
1708
    T0 = helper_recpe_u32(T0);
1709
}
1710

    
1711
NEON_OP(rsqrte_u32)
1712
{
1713
    T0 = helper_rsqrte_u32(T0);
1714
}
1715

    
1716
NEON_OP(recpe_f32)
1717
{
1718
    FT0s = helper_recpe_f32(FT0s);
1719
}
1720

    
1721
NEON_OP(rsqrte_f32)
1722
{
1723
    FT0s = helper_rsqrte_f32(FT0s);
1724
}
1725

    
1726
/* Table lookup.  This accesses the register file directly.  */
1727
NEON_OP(tbl)
1728
{
1729
    helper_neon_tbl(PARAM1, PARAM2);
1730
}
1731

    
1732
NEON_OP(dup_u8)
1733
{
1734
    T0 = (T0 >> PARAM1) & 0xff;
1735
    T0 |= T0 << 8;
1736
    T0 |= T0 << 16;
1737
    FORCE_RET();
1738
}
1739

    
1740
/* Helpers for element load/store.  */
1741
NEON_OP(insert_elt)
1742
{
1743
    int shift = PARAM1;
1744
    uint32_t mask = PARAM2;
1745
    T2 = (T2 & mask) | (T0 << shift);
1746
    FORCE_RET();
1747
}
1748

    
1749
NEON_OP(extract_elt)
1750
{
1751
    int shift = PARAM1;
1752
    uint32_t mask = PARAM2;
1753
    T0 = (T2 & mask) >> shift;
1754
    FORCE_RET();
1755
}