Statistics
| Branch: | Revision:

root / target-arm / op_neon.h @ 4373f3ce

History | View | Annotate | Download (35.4 kB)

1
/*
2
 * ARM NEON vector operations.
3
 *
4
 * Copyright (c) 2007 CodeSourcery.
5
 * Written by Paul Brook
6
 *
7
 * This code is licenced under the GPL.
8
 */
9
/* Note that for NEON an "l" prefix means it is a wide operation, unlike
10
   scalar arm ops where it means a word size operation.  */
11

    
12
#define SIGNBIT (uint32_t)0x80000000
13
/* ??? NEON ops should probably have their own float status.  */
14
#define NFS &env->vfp.fp_status
15
#define NEON_OP(name) void OPPROTO op_neon_##name (void)
16

    
17
/* Helper routines to perform bitwise copies between float and int.  */
18
static inline float32 vfp_itos(uint32_t i)
19
{
20
    union {
21
        uint32_t i;
22
        float32 s;
23
    } v;
24

    
25
    v.i = i;
26
    return v.s;
27
}
28

    
29
/* Return the raw 32-bit pattern stored in S; the inverse of vfp_itos.  */
static inline uint32_t vfp_stoi(float32 s)
{
    union {
        uint32_t i;
        float32 s;
    } pun;

    pun.s = s;
    return pun.i;
}
39

    
40
NEON_OP(getreg_T0)
41
{
42
    T0 = *(uint32_t *)((char *) env + PARAM1);
43
}
44

    
45
NEON_OP(getreg_T1)
46
{
47
    T1 = *(uint32_t *)((char *) env + PARAM1);
48
}
49

    
50
NEON_OP(getreg_T2)
51
{
52
    T2 = *(uint32_t *)((char *) env + PARAM1);
53
}
54

    
55
NEON_OP(setreg_T0)
56
{
57
    *(uint32_t *)((char *) env + PARAM1) = T0;
58
}
59

    
60
NEON_OP(setreg_T1)
61
{
62
    *(uint32_t *)((char *) env + PARAM1) = T1;
63
}
64

    
65
NEON_OP(setreg_T2)
66
{
67
    *(uint32_t *)((char *) env + PARAM1) = T2;
68
}
69

    
70
/* Lane structures overlaying one 32-bit word.  NEON_TYPEn(suffix, elem_t)
   declares neon_<suffix> with n members v1..vn of type elem_t.  When
   WORDS_BIGENDIAN is defined the members are declared in reverse order, so
   v1 is the last member instead of the first.  */
#define NEON_TYPE1(suffix, elem_t) \
typedef struct \
{ \
    elem_t v1; \
} neon_##suffix;

#ifdef WORDS_BIGENDIAN

#define NEON_TYPE2(suffix, elem_t) \
typedef struct \
{ \
    elem_t v2; \
    elem_t v1; \
} neon_##suffix;

#define NEON_TYPE4(suffix, elem_t) \
typedef struct \
{ \
    elem_t v4; \
    elem_t v3; \
    elem_t v2; \
    elem_t v1; \
} neon_##suffix;

#else /* !WORDS_BIGENDIAN */

#define NEON_TYPE2(suffix, elem_t) \
typedef struct \
{ \
    elem_t v1; \
    elem_t v2; \
} neon_##suffix;

#define NEON_TYPE4(suffix, elem_t) \
typedef struct \
{ \
    elem_t v1; \
    elem_t v2; \
    elem_t v3; \
    elem_t v4; \
} neon_##suffix;

#endif /* WORDS_BIGENDIAN */

/* Four 8-bit lanes, two 16-bit lanes, or one 32-bit lane per word.  */
NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)

/* The generator macros are not needed past this point.  */
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1
116

    
117
/* Copy from a uint32_t to a vector structure type.  VAL is written to the
   integer member of a temporary union and DEST receives the VTYPE member,
   so the bits are reinterpreted rather than converted.  */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } neon_pun_; \
    neon_pun_.i = (val); \
    (dest) = neon_pun_.v; \
    } while(0)
126

    
127
/* Copy from a vector structure type to a uint32_t.  VAL is written to the
   VTYPE member of a temporary union and DEST receives the integer member,
   so the bits are reinterpreted rather than converted.  */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } neon_pun_; \
    neon_pun_.v = (val); \
    (dest) = neon_pun_.i; \
    } while(0)
136

    
137
/* Element-wise application: NEON_DOn applies the currently defined NEON_FN
   to lanes v1..vn of the locals vdest, vsrc1 and vsrc2.  Each wider variant
   is the narrower one plus the additional lanes.  */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_DO1 \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_DO2 \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);
147

    
148
/* Define a two-operand vector op.  T0 and T1 are unpacked into lane
   structures of type VTYPE, the current NEON_FN is applied to each of the
   N lanes, and the packed result is written back to T0.  */
#define NEON_VOP(name, vtype, n) \
NEON_OP(name) \
{ \
    vtype vsrc1, vsrc2, vdest; \
    NEON_UNPACK(vtype, vsrc1, T0); \
    NEON_UNPACK(vtype, vsrc2, T1); \
    NEON_DO##n; \
    NEON_PACK(vtype, T0, vdest); \
    FORCE_RET(); \
}
160

    
161
/* Define a one-operand vector op.  Only T0 is unpacked; the result is
   packed back into T0.  NEON_DOn still passes vsrc2.vK as NEON_FN's third
   argument, and no vsrc2 is declared here, so the NEON_FN in effect must
   not expand its third parameter.  */
#define NEON_VOP1(name, vtype, n) \
NEON_OP(name) \
{ \
    vtype vsrc1, vdest; \
    NEON_UNPACK(vtype, vsrc1, T0); \
    NEON_DO##n; \
    NEON_PACK(vtype, T0, vdest); \
    FORCE_RET(); \
}
171

    
172
/* Pairwise operations.  */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same.  */
/* NEON_PDOn applies NEON_FN to adjacent lane pairs: the pairs of vsrc1 fill
   the low result lanes and the pairs of vsrc2 fill the high result lanes.
   The final line of each macro carries no line continuation, so the macro
   cannot absorb whatever line happens to follow it.  */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4);
183

    
184
/* Define a pairwise vector op.  Identical to NEON_VOP except that the lanes
   are combined through NEON_PDOn instead of NEON_DOn.  */
#define NEON_POP(name, vtype, n) \
NEON_OP(name) \
{ \
    vtype vsrc1, vsrc2, vdest; \
    NEON_UNPACK(vtype, vsrc1, T0); \
    NEON_UNPACK(vtype, vsrc2, T1); \
    NEON_PDO##n; \
    NEON_PACK(vtype, T0, vdest); \
    FORCE_RET(); \
}
196

    
197
#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
198
NEON_VOP(hadd_s8, neon_s8, 4)
199
NEON_VOP(hadd_u8, neon_u8, 4)
200
NEON_VOP(hadd_s16, neon_s16, 2)
201
NEON_VOP(hadd_u16, neon_u16, 2)
202
#undef NEON_FN
203

    
204
NEON_OP(hadd_s32)
205
{
206
    int32_t src1 = T0;
207
    int32_t src2 = T1;
208
    int32_t dest;
209

    
210
    dest = (src1 >> 1) + (src2 >> 1);
211
    if (src1 & src2 & 1)
212
        dest++;
213
    T0 = dest;
214
    FORCE_RET();
215
}
216

    
217
NEON_OP(hadd_u32)
218
{
219
    uint32_t src1 = T0;
220
    uint32_t src2 = T1;
221
    uint32_t dest;
222

    
223
    dest = (src1 >> 1) + (src2 >> 1);
224
    if (src1 & src2 & 1)
225
        dest++;
226
    T0 = dest;
227
    FORCE_RET();
228
}
229

    
230
#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
231
NEON_VOP(rhadd_s8, neon_s8, 4)
232
NEON_VOP(rhadd_u8, neon_u8, 4)
233
NEON_VOP(rhadd_s16, neon_s16, 2)
234
NEON_VOP(rhadd_u16, neon_u16, 2)
235
#undef NEON_FN
236

    
237
NEON_OP(rhadd_s32)
238
{
239
    int32_t src1 = T0;
240
    int32_t src2 = T1;
241
    int32_t dest;
242

    
243
    dest = (src1 >> 1) + (src2 >> 1);
244
    if ((src1 | src2) & 1)
245
        dest++;
246
    T0 = dest;
247
    FORCE_RET();
248
}
249

    
250
NEON_OP(rhadd_u32)
251
{
252
    uint32_t src1 = T0;
253
    uint32_t src2 = T1;
254
    uint32_t dest;
255

    
256
    dest = (src1 >> 1) + (src2 >> 1);
257
    if ((src1 | src2) & 1)
258
        dest++;
259
    T0 = dest;
260
    FORCE_RET();
261
}
262

    
263
#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
264
NEON_VOP(hsub_s8, neon_s8, 4)
265
NEON_VOP(hsub_u8, neon_u8, 4)
266
NEON_VOP(hsub_s16, neon_s16, 2)
267
NEON_VOP(hsub_u16, neon_u16, 2)
268
#undef NEON_FN
269

    
270
NEON_OP(hsub_s32)
271
{
272
    int32_t src1 = T0;
273
    int32_t src2 = T1;
274
    int32_t dest;
275

    
276
    dest = (src1 >> 1) - (src2 >> 1);
277
    if ((~src1) & src2 & 1)
278
        dest--;
279
    T0 = dest;
280
    FORCE_RET();
281
}
282

    
283
NEON_OP(hsub_u32)
284
{
285
    uint32_t src1 = T0;
286
    uint32_t src2 = T1;
287
    uint32_t dest;
288

    
289
    dest = (src1 >> 1) - (src2 >> 1);
290
    if ((~src1) & src2 & 1)
291
        dest--;
292
    T0 = dest;
293
    FORCE_RET();
294
}
295

    
296
/* ??? bsl, bif and bit are all the same op, just with the oparands in a
297
   differnet order.  It's currently easier to have 3 differnt ops than
298
   rearange the operands.  */
299

    
300
/* Bitwise Select.  */
301
NEON_OP(bsl)
302
{
303
    T0 = (T0 & T2) | (T1 & ~T2);
304
}
305

    
306
/* Bitwise Insert If True.  */
307
NEON_OP(bit)
308
{
309
    T0 = (T0 & T1) | (T2 & ~T1);
310
}
311

    
312
/* Bitwise Insert If False.  */
313
NEON_OP(bif)
314
{
315
    T0 = (T2 & T1) | (T0 & ~T1);
316
}
317

    
318
#define NEON_USAT(dest, src1, src2, type) do { \
319
    uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
320
    if (tmp != (type)tmp) { \
321
        env->QF = 1; \
322
        dest = ~0; \
323
    } else { \
324
        dest = tmp; \
325
    }} while(0)
326
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
327
NEON_VOP(qadd_u8, neon_u8, 4)
328
#undef NEON_FN
329
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
330
NEON_VOP(qadd_u16, neon_u16, 2)
331
#undef NEON_FN
332
#undef NEON_USAT
333

    
334
#define NEON_SSAT(dest, src1, src2, type) do { \
335
    int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
336
    if (tmp != (type)tmp) { \
337
        env->QF = 1; \
338
        if (src2 > 0) { \
339
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
340
        } else { \
341
            tmp = 1 << (sizeof(type) * 8 - 1); \
342
        } \
343
    } \
344
    dest = tmp; \
345
    } while(0)
346
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
347
NEON_VOP(qadd_s8, neon_s8, 4)
348
#undef NEON_FN
349
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
350
NEON_VOP(qadd_s16, neon_s16, 2)
351
#undef NEON_FN
352
#undef NEON_SSAT
353

    
354
#define NEON_USAT(dest, src1, src2, type) do { \
355
    uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
356
    if (tmp != (type)tmp) { \
357
        env->QF = 1; \
358
        dest = 0; \
359
    } else { \
360
        dest = tmp; \
361
    }} while(0)
362
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
363
NEON_VOP(qsub_u8, neon_u8, 4)
364
#undef NEON_FN
365
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
366
NEON_VOP(qsub_u16, neon_u16, 2)
367
#undef NEON_FN
368
#undef NEON_USAT
369

    
370
#define NEON_SSAT(dest, src1, src2, type) do { \
371
    int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
372
    if (tmp != (type)tmp) { \
373
        env->QF = 1; \
374
        if (src2 < 0) { \
375
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
376
        } else { \
377
            tmp = 1 << (sizeof(type) * 8 - 1); \
378
        } \
379
    } \
380
    dest = tmp; \
381
    } while(0)
382
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
383
NEON_VOP(qsub_s8, neon_s8, 4)
384
#undef NEON_FN
385
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
386
NEON_VOP(qsub_s16, neon_s16, 2)
387
#undef NEON_FN
388
#undef NEON_SSAT
389

    
390
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? ~0 : 0
391
NEON_VOP(cgt_s8, neon_s8, 4)
392
NEON_VOP(cgt_u8, neon_u8, 4)
393
NEON_VOP(cgt_s16, neon_s16, 2)
394
NEON_VOP(cgt_u16, neon_u16, 2)
395
NEON_VOP(cgt_s32, neon_s32, 1)
396
NEON_VOP(cgt_u32, neon_u32, 1)
397
#undef NEON_FN
398

    
399
#define NEON_FN(dest, src1, src2) dest = (src1 >= src2) ? ~0 : 0
400
NEON_VOP(cge_s8, neon_s8, 4)
401
NEON_VOP(cge_u8, neon_u8, 4)
402
NEON_VOP(cge_s16, neon_s16, 2)
403
NEON_VOP(cge_u16, neon_u16, 2)
404
NEON_VOP(cge_s32, neon_s32, 1)
405
NEON_VOP(cge_u32, neon_u32, 1)
406
#undef NEON_FN
407

    
408
#define NEON_FN(dest, src1, src2) do { \
409
    int8_t tmp; \
410
    tmp = (int8_t)src2; \
411
    if (tmp < 0) { \
412
        dest = src1 >> -tmp; \
413
    } else { \
414
        dest = src1 << tmp; \
415
    }} while (0)
416
NEON_VOP(shl_s8, neon_s8, 4)
417
NEON_VOP(shl_u8, neon_u8, 4)
418
NEON_VOP(shl_s16, neon_s16, 2)
419
NEON_VOP(shl_u16, neon_u16, 2)
420
NEON_VOP(shl_s32, neon_s32, 1)
421
NEON_VOP(shl_u32, neon_u32, 1)
422
#undef NEON_FN
423

    
424
NEON_OP(shl_u64)
425
{
426
    int8_t shift = T2;
427
    uint64_t val = T0 | ((uint64_t)T1 << 32);
428
    if (shift < 0) {
429
        val >>= -shift;
430
    } else {
431
        val <<= shift;
432
    }
433
    T0 = val;
434
    T1 = val >> 32;
435
    FORCE_RET();
436
}
437

    
438
NEON_OP(shl_s64)
439
{
440
    int8_t shift = T2;
441
    int64_t val = T0 | ((uint64_t)T1 << 32);
442
    if (shift < 0) {
443
        val >>= -shift;
444
    } else {
445
        val <<= shift;
446
    }
447
    T0 = val;
448
    T1 = val >> 32;
449
    FORCE_RET();
450
}
451

    
452
#define NEON_FN(dest, src1, src2) do { \
453
    int8_t tmp; \
454
    tmp = (int8_t)src1; \
455
    if (tmp < 0) { \
456
        dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \
457
    } else { \
458
        dest = src2 << tmp; \
459
    }} while (0)
460

    
461
NEON_VOP(rshl_s8, neon_s8, 4)
462
NEON_VOP(rshl_u8, neon_u8, 4)
463
NEON_VOP(rshl_s16, neon_s16, 2)
464
NEON_VOP(rshl_u16, neon_u16, 2)
465
NEON_VOP(rshl_s32, neon_s32, 1)
466
NEON_VOP(rshl_u32, neon_u32, 1)
467
#undef NEON_FN
468

    
469
NEON_OP(rshl_u64)
470
{
471
    int8_t shift = T2;
472
    uint64_t val = T0 | ((uint64_t)T1 << 32);
473
    if (shift < 0) {
474
        val = (val + ((uint64_t)1 << (-1 - shift))) >> -shift;
475
        val >>= -shift;
476
    } else {
477
        val <<= shift;
478
    }
479
    T0 = val;
480
    T1 = val >> 32;
481
    FORCE_RET();
482
}
483

    
484
NEON_OP(rshl_s64)
485
{
486
    int8_t shift = T2;
487
    int64_t val = T0 | ((uint64_t)T1 << 32);
488
    if (shift < 0) {
489
        val = (val + ((int64_t)1 << (-1 - shift))) >> -shift;
490
    } else {
491
        val <<= shift;
492
    }
493
    T0 = val;
494
    T1 = val >> 32;
495
    FORCE_RET();
496
}
497

    
498
#define NEON_FN(dest, src1, src2) do { \
499
    int8_t tmp; \
500
    tmp = (int8_t)src1; \
501
    if (tmp < 0) { \
502
        dest = src2 >> -tmp; \
503
    } else { \
504
        dest = src2 << tmp; \
505
        if ((dest >> tmp) != src2) { \
506
            env->QF = 1; \
507
            dest = ~0; \
508
        } \
509
    }} while (0)
510
NEON_VOP(qshl_s8, neon_s8, 4)
511
NEON_VOP(qshl_s16, neon_s16, 2)
512
NEON_VOP(qshl_s32, neon_s32, 1)
513
#undef NEON_FN
514

    
515
NEON_OP(qshl_s64)
516
{
517
    int8_t shift = T2;
518
    int64_t val = T0 | ((uint64_t)T1 << 32);
519
    if (shift < 0) {
520
        val >>= -shift;
521
    } else {
522
        int64_t tmp = val;
523
        val <<= shift;
524
        if ((val >> shift) != tmp) {
525
            env->QF = 1;
526
            val = (tmp >> 63) ^ 0x7fffffffffffffffULL;
527
        }
528
    }
529
    T0 = val;
530
    T1 = val >> 32;
531
    FORCE_RET();
532
}
533

    
534
#define NEON_FN(dest, src1, src2) do { \
535
    int8_t tmp; \
536
    tmp = (int8_t)src1; \
537
    if (tmp < 0) { \
538
        dest = src2 >> -tmp; \
539
    } else { \
540
        dest = src2 << tmp; \
541
        if ((dest >> tmp) != src2) { \
542
            env->QF = 1; \
543
            dest = src2 >> 31; \
544
        } \
545
    }} while (0)
546
NEON_VOP(qshl_u8, neon_u8, 4)
547
NEON_VOP(qshl_u16, neon_u16, 2)
548
NEON_VOP(qshl_u32, neon_u32, 1)
549
#undef NEON_FN
550

    
551
NEON_OP(qshl_u64)
552
{
553
    int8_t shift = T2;
554
    uint64_t val = T0 | ((uint64_t)T1 << 32);
555
    if (shift < 0) {
556
        val >>= -shift;
557
    } else {
558
        uint64_t tmp = val;
559
        val <<= shift;
560
        if ((val >> shift) != tmp) {
561
            env->QF = 1;
562
            val = ~(uint64_t)0;
563
        }
564
    }
565
    T0 = val;
566
    T1 = val >> 32;
567
    FORCE_RET();
568
}
569

    
570
#define NEON_FN(dest, src1, src2) do { \
571
    int8_t tmp; \
572
    tmp = (int8_t)src1; \
573
    if (tmp < 0) { \
574
        dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \
575
    } else { \
576
        dest = src2 << tmp; \
577
        if ((dest >> tmp) != src2) { \
578
            dest = ~0; \
579
        } \
580
    }} while (0)
581
NEON_VOP(qrshl_s8, neon_s8, 4)
582
NEON_VOP(qrshl_s16, neon_s16, 2)
583
NEON_VOP(qrshl_s32, neon_s32, 1)
584
#undef NEON_FN
585

    
586
#define NEON_FN(dest, src1, src2) do { \
587
    int8_t tmp; \
588
    tmp = (int8_t)src1; \
589
    if (tmp < 0) { \
590
        dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \
591
    } else { \
592
        dest = src2 << tmp; \
593
        if ((dest >> tmp) != src2) { \
594
            env->QF = 1; \
595
            dest = src2 >> 31; \
596
        } \
597
    }} while (0)
598
NEON_VOP(qrshl_u8, neon_u8, 4)
599
NEON_VOP(qrshl_u16, neon_u16, 2)
600
NEON_VOP(qrshl_u32, neon_u32, 1)
601
#undef NEON_FN
602

    
603
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
604
NEON_VOP(max_s8, neon_s8, 4)
605
NEON_VOP(max_u8, neon_u8, 4)
606
NEON_VOP(max_s16, neon_s16, 2)
607
NEON_VOP(max_u16, neon_u16, 2)
608
NEON_VOP(max_s32, neon_s32, 1)
609
NEON_VOP(max_u32, neon_u32, 1)
610
NEON_POP(pmax_s8, neon_s8, 4)
611
NEON_POP(pmax_u8, neon_u8, 4)
612
NEON_POP(pmax_s16, neon_s16, 2)
613
NEON_POP(pmax_u16, neon_u16, 2)
614
#undef NEON_FN
615

    
616
NEON_OP(max_f32)
617
{
618
    float32 f0 = vfp_itos(T0);
619
    float32 f1 = vfp_itos(T1);
620
    T0 = (float32_compare_quiet(f0, f1, NFS) == 1) ? T0 : T1;
621
    FORCE_RET();
622
}
623

    
624
#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
625
NEON_VOP(min_s8, neon_s8, 4)
626
NEON_VOP(min_u8, neon_u8, 4)
627
NEON_VOP(min_s16, neon_s16, 2)
628
NEON_VOP(min_u16, neon_u16, 2)
629
NEON_VOP(min_s32, neon_s32, 1)
630
NEON_VOP(min_u32, neon_u32, 1)
631
NEON_POP(pmin_s8, neon_s8, 4)
632
NEON_POP(pmin_u8, neon_u8, 4)
633
NEON_POP(pmin_s16, neon_s16, 2)
634
NEON_POP(pmin_u16, neon_u16, 2)
635
#undef NEON_FN
636

    
637
NEON_OP(min_f32)
638
{
639
    float32 f0 = vfp_itos(T0);
640
    float32 f1 = vfp_itos(T1);
641
    T0 = (float32_compare_quiet(f0, f1, NFS) == -1) ? T0 : T1;
642
    FORCE_RET();
643
}
644

    
645
#define NEON_FN(dest, src1, src2) \
646
    dest = (src1 > src2) ? (src1 - src2) : (src2 - src1)
647
NEON_VOP(abd_s8, neon_s8, 4)
648
NEON_VOP(abd_u8, neon_u8, 4)
649
NEON_VOP(abd_s16, neon_s16, 2)
650
NEON_VOP(abd_u16, neon_u16, 2)
651
NEON_VOP(abd_s32, neon_s32, 1)
652
NEON_VOP(abd_u32, neon_u32, 1)
653
#undef NEON_FN
654

    
655
NEON_OP(abd_f32)
656
{
657
    float32 f0 = vfp_itos(T0);
658
    float32 f1 = vfp_itos(T1);
659
    T0 = vfp_stoi((float32_compare_quiet(f0, f1, NFS) == 1)
660
                  ? float32_sub(f0, f1, NFS)
661
                  : float32_sub(f1, f0, NFS));
662
    FORCE_RET();
663
}
664

    
665
#define NEON_FN(dest, src1, src2) dest = src1 + src2
666
NEON_VOP(add_u8, neon_u8, 4)
667
NEON_VOP(add_u16, neon_u16, 2)
668
NEON_POP(padd_u8, neon_u8, 4)
669
NEON_POP(padd_u16, neon_u16, 2)
670
#undef NEON_FN
671

    
672
NEON_OP(add_f32)
673
{
674
    T0 = vfp_stoi(float32_add(vfp_itos(T0), vfp_itos(T1), NFS));
675
    FORCE_RET();
676
}
677

    
678
#define NEON_FN(dest, src1, src2) dest = src1 - src2
679
NEON_VOP(sub_u8, neon_u8, 4)
680
NEON_VOP(sub_u16, neon_u16, 2)
681
#undef NEON_FN
682

    
683
NEON_OP(sub_f32)
684
{
685
    T0 = vfp_stoi(float32_sub(vfp_itos(T0), vfp_itos(T1), NFS));
686
    FORCE_RET();
687
}
688

    
689
#define NEON_FN(dest, src1, src2) dest = src2 - src1
690
NEON_VOP(rsb_u8, neon_u8, 4)
691
NEON_VOP(rsb_u16, neon_u16, 2)
692
#undef NEON_FN
693

    
694
NEON_OP(rsb_f32)
695
{
696
    T0 = vfp_stoi(float32_sub(vfp_itos(T1), vfp_itos(T0), NFS));
697
    FORCE_RET();
698
}
699

    
700
#define NEON_FN(dest, src1, src2) dest = src1 * src2
701
NEON_VOP(mul_u8, neon_u8, 4)
702
NEON_VOP(mul_u16, neon_u16, 2)
703
#undef NEON_FN
704

    
705
NEON_OP(mul_f32)
706
{
707
    T0 = vfp_stoi(float32_mul(vfp_itos(T0), vfp_itos(T1), NFS));
708
    FORCE_RET();
709
}
710

    
711
NEON_OP(mul_p8)
712
{
713
    T0 = helper_neon_mul_p8(T0, T1);
714
}
715

    
716
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
717
NEON_VOP(tst_u8, neon_u8, 4)
718
NEON_VOP(tst_u16, neon_u16, 2)
719
NEON_VOP(tst_u32, neon_u32, 1)
720
#undef NEON_FN
721

    
722
#define NEON_FN(dest, src1, src2) dest = (src1 == src2) ? -1 : 0
723
NEON_VOP(ceq_u8, neon_u8, 4)
724
NEON_VOP(ceq_u16, neon_u16, 2)
725
NEON_VOP(ceq_u32, neon_u32, 1)
726
#undef NEON_FN
727

    
728
/* Signed saturating doubling multiply returning the high half, for 16-bit
   lanes.  The operands are multiplied as int16_t values into 32 bits.  If
   doubling the product would change its sign bit, env->QF is raised and the
   product is replaced by the saturated value; otherwise it is doubled.  The
   doubling is in the else branch so that a saturated value is not shifted
   again (matching NEON_QDMULH32).  When ROUND is non-zero, 1 << 15 is
   added, saturating to SIGNBIT - 1 if the addition wraps.  DEST receives
   bits 31..16.  */
#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t)(src1) * (int16_t)(src2); \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        env->QF = 1; \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            env->QF = 1; \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
745
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
746
NEON_VOP(qdmulh_s16, neon_s16, 2)
747
#undef NEON_FN
748
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
749
NEON_VOP(qrdmulh_s16, neon_s16, 2)
750
#undef NEON_FN
751
#undef NEON_QDMULH16
752

    
753
#define SIGNBIT64 ((uint64_t)1 << 63)
/* Signed saturating doubling multiply returning the high half, for 32-bit
   lanes.  The operands are multiplied as int32_t values into 64 bits.  The
   product is doubled unless doing so would change its sign bit, in which
   case env->QF is raised and the saturated value is used instead.  When
   ROUND is non-zero, 1 << 31 is added, saturating to SIGNBIT64 - 1 if the
   addition wraps.  DEST receives bits 63..32.  */
#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t acc = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if (!((acc ^ (acc << 1)) & SIGNBIT64)) { \
        acc <<= 1; \
    } else { \
        env->QF = 1; \
        acc = (acc >> 63) ^ ~SIGNBIT64; \
    } \
    if (round) { \
        int64_t before = acc; \
        acc += (int64_t)1 << 31; \
        if ((int64_t)acc < before) { \
            env->QF = 1; \
            acc = SIGNBIT64 - 1; \
        } \
    } \
    dest = acc >> 32; \
    } while(0)
772
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
773
NEON_VOP(qdmulh_s32, neon_s32, 1)
774
#undef NEON_FN
775
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
776
NEON_VOP(qrdmulh_s32, neon_s32, 1)
777
#undef NEON_FN
778
#undef NEON_QDMULH32
779

    
780
/* Floating point comparisons produce an integer result.  */
781
#define NEON_VOP_FCMP(name, cmp) \
782
NEON_OP(name) \
783
{ \
784
    if (float32_compare_quiet(vfp_itos(T0), vfp_itos(T1), NFS) cmp 0) \
785
        T0 = -1; \
786
    else \
787
        T0 = 0; \
788
    FORCE_RET(); \
789
}
790

    
791
NEON_VOP_FCMP(ceq_f32, ==)
792
NEON_VOP_FCMP(cge_f32, >=)
793
NEON_VOP_FCMP(cgt_f32, >)
794

    
795
NEON_OP(acge_f32)
796
{
797
    float32 f0 = float32_abs(vfp_itos(T0));
798
    float32 f1 = float32_abs(vfp_itos(T1));
799
    T0 = (float32_compare_quiet(f0, f1,NFS) >= 0) ? -1 : 0;
800
    FORCE_RET();
801
}
802

    
803
NEON_OP(acgt_f32)
804
{
805
    float32 f0 = float32_abs(vfp_itos(T0));
806
    float32 f1 = float32_abs(vfp_itos(T1));
807
    T0 = (float32_compare_quiet(f0, f1, NFS) > 0) ? -1 : 0;
808
    FORCE_RET();
809
}
810

    
811
/* Narrowing instructions.  The named type is the destination type.  */
812
NEON_OP(narrow_u8)
813
{
814
    T0 = (T0 & 0xff) | ((T0 >> 8) & 0xff00)
815
         | ((T1 << 16) & 0xff0000) | (T1 << 24);
816
    FORCE_RET();
817
}
818

    
819
NEON_OP(narrow_sat_u8)
820
{
821
    neon_u16 src;
822
    neon_u8 dest;
823
#define SAT8(d, s) \
824
    if (s > 0xff) { \
825
        d = 0xff; \
826
        env->QF = 1; \
827
    } else  { \
828
        d = s; \
829
    }
830

    
831
    NEON_UNPACK(neon_u16, src, T0);
832
    SAT8(dest.v1, src.v1);
833
    SAT8(dest.v2, src.v2);
834
    NEON_UNPACK(neon_u16, src, T1);
835
    SAT8(dest.v3, src.v1);
836
    SAT8(dest.v4, src.v2);
837
    NEON_PACK(neon_u8, T0, dest);
838
    FORCE_RET();
839
#undef SAT8
840
}
841

    
842
NEON_OP(narrow_sat_s8)
843
{
844
    neon_s16 src;
845
    neon_s8 dest;
846
#define SAT8(d, s) \
847
    if (s != (uint8_t)s) { \
848
        d = (s >> 15) ^ 0x7f; \
849
        env->QF = 1; \
850
    } else  { \
851
        d = s; \
852
    }
853

    
854
    NEON_UNPACK(neon_s16, src, T0);
855
    SAT8(dest.v1, src.v1);
856
    SAT8(dest.v2, src.v2);
857
    NEON_UNPACK(neon_s16, src, T1);
858
    SAT8(dest.v3, src.v1);
859
    SAT8(dest.v4, src.v2);
860
    NEON_PACK(neon_s8, T0, dest);
861
    FORCE_RET();
862
#undef SAT8
863
}
864

    
865
NEON_OP(narrow_u16)
866
{
867
    T0 = (T0 & 0xffff) | (T1 << 16);
868
}
869

    
870
NEON_OP(narrow_sat_u16)
871
{
872
    if (T0 > 0xffff) {
873
        T0 = 0xffff;
874
        env->QF = 1;
875
    }
876
    if (T1 > 0xffff) {
877
        T1 = 0xffff;
878
        env->QF = 1;
879
    }
880
    T0 |= T1 << 16;
881
    FORCE_RET();
882
}
883

    
884
NEON_OP(narrow_sat_s16)
885
{
886
    if ((int32_t)T0 != (int16_t)T0) {
887
        T0 = ((int32_t)T0 >> 31) ^ 0x7fff;
888
        env->QF = 1;
889
    }
890
    if ((int32_t)T1 != (int16_t) T1) {
891
        T1 = ((int32_t)T1 >> 31) ^ 0x7fff;
892
        env->QF = 1;
893
    }
894
    T0 = (uint16_t)T0 | (T1 << 16);
895
    FORCE_RET();
896
}
897

    
898
NEON_OP(narrow_sat_u32)
899
{
900
    if (T1) {
901
        T0 = 0xffffffffu;
902
        env->QF = 1;
903
    }
904
    FORCE_RET();
905
}
906

    
907
NEON_OP(narrow_sat_s32)
908
{
909
    int32_t sign = (int32_t)T1 >> 31;
910

    
911
    if ((int32_t)T1 != sign) {
912
        T0 = sign ^ 0x7fffffff;
913
        env->QF = 1;
914
    }
915
    FORCE_RET();
916
}
917

    
918
/* Narrowing instructions.  Named type is the narrow type.  */
919
NEON_OP(narrow_high_u8)
920
{
921
    T0 = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00)
922
        | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000);
923
    FORCE_RET();
924
}
925

    
926
NEON_OP(narrow_high_u16)
927
{
928
    T0 = (T0 >> 16) | (T1 & 0xffff0000);
929
    FORCE_RET();
930
}
931

    
932
NEON_OP(narrow_high_round_u8)
933
{
934
    T0 = (((T0 + 0x80) >> 8) & 0xff) | (((T0 + 0x800000) >> 16) & 0xff00)
935
        | (((T1 + 0x80) << 8) & 0xff0000) | ((T1 + 0x800000) & 0xff000000);
936
    FORCE_RET();
937
}
938

    
939
NEON_OP(narrow_high_round_u16)
940
{
941
    T0 = ((T0 + 0x8000) >> 16) | ((T1 + 0x8000) & 0xffff0000);
942
    FORCE_RET();
943
}
944

    
945
NEON_OP(narrow_high_round_u32)
946
{
947
    if (T0 >= 0x80000000u)
948
        T0 = T1 + 1;
949
    else
950
        T0 = T1;
951
    FORCE_RET();
952
}
953

    
954
/* Widening instructions.  Named type is source type.  */
955
NEON_OP(widen_s8)
956
{
957
    uint32_t src;
958

    
959
    src = T0;
960
    T0 = (uint16_t)(int8_t)src | ((int8_t)(src >> 8) << 16);
961
    T1 = (uint16_t)(int8_t)(src >> 16) | ((int8_t)(src >> 24) << 16);
962
}
963

    
964
NEON_OP(widen_u8)
965
{
966
    T1 = ((T0 >> 8) & 0xff0000) | ((T0 >> 16) & 0xff);
967
    T0 = ((T0 << 8) & 0xff0000) | (T0 & 0xff);
968
}
969

    
970
NEON_OP(widen_s16)
971
{
972
    int32_t src;
973

    
974
    src = T0;
975
    T0 = (int16_t)src;
976
    T1 = src >> 16;
977
}
978

    
979
NEON_OP(widen_u16)
980
{
981
    T1 = T0 >> 16;
982
    T0 &= 0xffff;
983
}
984

    
985
NEON_OP(widen_s32)
986
{
987
    T1 = (int32_t)T0 >> 31;
988
    FORCE_RET();
989
}
990

    
991
NEON_OP(widen_high_u8)
992
{
993
    T1 = (T0 & 0xff000000) | ((T0 >> 8) & 0xff00);
994
    T0 = ((T0 << 16) & 0xff000000) | ((T0 << 8) & 0xff00);
995
}
996

    
997
NEON_OP(widen_high_u16)
998
{
999
    T1 = T0 & 0xffff0000;
1000
    T0 <<= 16;
1001
}
1002

    
1003
/* Long operations.  The type is the wide type.  */
1004
NEON_OP(shll_u16)
1005
{
1006
    int shift = PARAM1;
1007
    uint32_t mask;
1008

    
1009
    mask = 0xffff >> (16 - shift);
1010
    mask |= mask << 16;
1011
    mask = ~mask;
1012

    
1013
    T0 = (T0 << shift) & mask;
1014
    T1 = (T1 << shift) & mask;
1015
    FORCE_RET();
1016
}
1017

    
1018
NEON_OP(shll_u64)
1019
{
1020
    int shift = PARAM1;
1021

    
1022
    T1 <<= shift;
1023
    T1 |= T0 >> (32 - shift);
1024
    T0 <<= shift;
1025
    FORCE_RET();
1026
}
1027

    
1028
NEON_OP(addl_u16)
1029
{
1030
    uint32_t tmp;
1031
    uint32_t high;
1032

    
1033
    tmp = env->vfp.scratch[0];
1034
    high = (T0 >> 16) + (tmp >> 16);
1035
    T0 = (uint16_t)(T0 + tmp);
1036
    T0 |= (high << 16);
1037
    tmp = env->vfp.scratch[1];
1038
    high = (T1 >> 16) + (tmp >> 16);
1039
    T1 = (uint16_t)(T1 + tmp);
1040
    T1 |= (high << 16);
1041
    FORCE_RET();
1042
}
1043

    
1044
NEON_OP(addl_u32)
1045
{
1046
    T0 += env->vfp.scratch[0];
1047
    T1 += env->vfp.scratch[1];
1048
    FORCE_RET();
1049
}
1050

    
1051
NEON_OP(addl_u64)
1052
{
1053
    uint64_t tmp;
1054
    tmp = T0 | ((uint64_t)T1 << 32);
1055
    tmp += env->vfp.scratch[0];
1056
    tmp += (uint64_t)env->vfp.scratch[1] << 32;
1057
    T0 = tmp;
1058
    T1 = tmp >> 32;
1059
    FORCE_RET();
1060
}
1061

    
1062
NEON_OP(subl_u16)
1063
{
1064
    uint32_t tmp;
1065
    uint32_t high;
1066

    
1067
    tmp = env->vfp.scratch[0];
1068
    high = (T0 >> 16) - (tmp >> 16);
1069
    T0 = (uint16_t)(T0 - tmp);
1070
    T0 |= (high << 16);
1071
    tmp = env->vfp.scratch[1];
1072
    high = (T1 >> 16) - (tmp >> 16);
1073
    T1 = (uint16_t)(T1 - tmp);
1074
    T1 |= (high << 16);
1075
    FORCE_RET();
1076
}
1077

    
1078
NEON_OP(subl_u32)
1079
{
1080
    T0 -= env->vfp.scratch[0];
1081
    T1 -= env->vfp.scratch[1];
1082
    FORCE_RET();
1083
}
1084

    
1085
NEON_OP(subl_u64)
1086
{
1087
    uint64_t tmp;
1088
    tmp = T0 | ((uint64_t)T1 << 32);
1089
    tmp -= env->vfp.scratch[0];
1090
    tmp -= (uint64_t)env->vfp.scratch[1] << 32;
1091
    T0 = tmp;
1092
    T1 = tmp >> 32;
1093
    FORCE_RET();
1094
}
1095

    
1096
/* DEST = |X - Y|, where X and Y are first converted to TYPE.  Both inputs
   are copied into temporaries before DEST is written, so DEST may name the
   same object as X or Y.  */
#define DO_ABD(dest, x, y, type) do { \
    type abd_lhs = x; \
    type abd_rhs = y; \
    if (abd_lhs > abd_rhs) \
        dest = abd_lhs - abd_rhs; \
    else \
        dest = abd_rhs - abd_lhs; \
    } while(0)
1101

    
1102
NEON_OP(abdl_u16)
1103
{
1104
    uint32_t tmp;
1105
    uint32_t low;
1106
    uint32_t high;
1107

    
1108
    DO_ABD(low, T0, T1, uint8_t);
1109
    DO_ABD(tmp, T0 >> 8, T1 >> 8, uint8_t);
1110
    low |= tmp << 16;
1111
    DO_ABD(high, T0 >> 16, T1 >> 16, uint8_t);
1112
    DO_ABD(tmp, T0 >> 24, T1 >> 24, uint8_t);
1113
    high |= tmp << 16;
1114
    T0 = low;
1115
    T1 = high;
1116
    FORCE_RET();
1117
}
1118

    
1119
NEON_OP(abdl_s16)
1120
{
1121
    uint32_t tmp;
1122
    uint32_t low;
1123
    uint32_t high;
1124

    
1125
    DO_ABD(low, T0, T1, int8_t);
1126
    DO_ABD(tmp, T0 >> 8, T1 >> 8, int8_t);
1127
    low |= tmp << 16;
1128
    DO_ABD(high, T0 >> 16, T1 >> 16, int8_t);
1129
    DO_ABD(tmp, T0 >> 24, T1 >> 24, int8_t);
1130
    high |= tmp << 16;
1131
    T0 = low;
1132
    T1 = high;
1133
    FORCE_RET();
1134
}
1135

    
1136
NEON_OP(abdl_u32)
1137
{
1138
    uint32_t low;
1139
    uint32_t high;
1140

    
1141
    DO_ABD(low, T0, T1, uint16_t);
1142
    DO_ABD(high, T0 >> 16, T1 >> 16, uint16_t);
1143
    T0 = low;
1144
    T1 = high;
1145
    FORCE_RET();
1146
}
1147

    
1148
NEON_OP(abdl_s32)
1149
{
1150
    uint32_t low;
1151
    uint32_t high;
1152

    
1153
    DO_ABD(low, T0, T1, int16_t);
1154
    DO_ABD(high, T0 >> 16, T1 >> 16, int16_t);
1155
    T0 = low;
1156
    T1 = high;
1157
    FORCE_RET();
1158
}
1159

    
1160
NEON_OP(abdl_u64)
1161
{
1162
    DO_ABD(T0, T0, T1, uint32_t);
1163
    T1 = 0;
1164
}
1165

    
1166
NEON_OP(abdl_s64)
1167
{
1168
    DO_ABD(T0, T0, T1, int32_t);
1169
    T1 = 0;
1170
}
1171
#undef DO_ABD
1172

    
1173
/* Widening multiple. Named type is the source type.  */
1174
/* Multiply X and Y, each first truncated to TYPE1, and store the
   product truncated to TYPE2 in DEST.
   The leading "1u *" forces the arithmetic into an unsigned type at
   least as wide as unsigned int.  Without it, a 16-bit TYPE2 promotes
   both operands to int and 0xffff * 0xffff overflows (undefined
   behaviour); the low TYPE2 bits of the product are unchanged.  */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = (x); \
    type1 tmp_y = (y); \
    dest = (type2)(1u * (type2)tmp_x * (type2)tmp_y); \
    } while(0)
1179

    
1180
NEON_OP(mull_u8)
1181
{
1182
    uint32_t tmp;
1183
    uint32_t low;
1184
    uint32_t high;
1185

    
1186
    DO_MULL(low, T0, T1, uint8_t, uint16_t);
1187
    DO_MULL(tmp, T0 >> 8, T1 >> 8, uint8_t, uint16_t);
1188
    low |= tmp << 16;
1189
    DO_MULL(high, T0 >> 16, T1 >> 16, uint8_t, uint16_t);
1190
    DO_MULL(tmp, T0 >> 24, T1 >> 24, uint8_t, uint16_t);
1191
    high |= tmp << 16;
1192
    T0 = low;
1193
    T1 = high;
1194
    FORCE_RET();
1195
}
1196

    
1197
NEON_OP(mull_s8)
1198
{
1199
    uint32_t tmp;
1200
    uint32_t low;
1201
    uint32_t high;
1202

    
1203
    DO_MULL(low, T0, T1, int8_t, uint16_t);
1204
    DO_MULL(tmp, T0 >> 8, T1 >> 8, int8_t, uint16_t);
1205
    low |= tmp << 16;
1206
    DO_MULL(high, T0 >> 16, T1 >> 16, int8_t, uint16_t);
1207
    DO_MULL(tmp, T0 >> 24, T1 >> 24, int8_t, uint16_t);
1208
    high |= tmp << 16;
1209
    T0 = low;
1210
    T1 = high;
1211
    FORCE_RET();
1212
}
1213

    
1214
NEON_OP(mull_u16)
1215
{
1216
    uint32_t low;
1217
    uint32_t high;
1218

    
1219
    DO_MULL(low, T0, T1, uint16_t, uint32_t);
1220
    DO_MULL(high, T0 >> 16, T1 >> 16, uint16_t, uint32_t);
1221
    T0 = low;
1222
    T1 = high;
1223
    FORCE_RET();
1224
}
1225

    
1226
NEON_OP(mull_s16)
1227
{
1228
    uint32_t low;
1229
    uint32_t high;
1230

    
1231
    DO_MULL(low, T0, T1, int16_t, uint32_t);
1232
    DO_MULL(high, T0 >> 16, T1 >> 16, int16_t, uint32_t);
1233
    T0 = low;
1234
    T1 = high;
1235
    FORCE_RET();
1236
}
1237

    
1238
NEON_OP(addl_saturate_s32)
1239
{
1240
    uint32_t tmp;
1241
    uint32_t res;
1242

    
1243
    tmp = env->vfp.scratch[0];
1244
    res = T0 + tmp;
1245
    if (((res ^ T0) & SIGNBIT) && !((T0 ^ tmp) & SIGNBIT)) {
1246
        env->QF = 1;
1247
        T0 = (T0 >> 31) ^ 0x7fffffff;
1248
    } else {
1249
      T0 = res;
1250
    }
1251
    tmp = env->vfp.scratch[1];
1252
    res = T1 + tmp;
1253
    if (((res ^ T1) & SIGNBIT) && !((T1 ^ tmp) & SIGNBIT)) {
1254
        env->QF = 1;
1255
        T1 = (T1 >> 31) ^ 0x7fffffff;
1256
    } else {
1257
      T1 = res;
1258
    }
1259
    FORCE_RET();
1260
}
1261

    
1262
NEON_OP(addl_saturate_s64)
1263
{
1264
    uint64_t src1;
1265
    uint64_t src2;
1266
    uint64_t res;
1267

    
1268
    src1 = T0 + ((uint64_t)T1 << 32);
1269
    src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);
1270
    res = src1 + src2;
1271
    if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {
1272
        env->QF = 1;
1273
        T0 = ~(int64_t)src1 >> 63;
1274
        T1 = T0 ^ 0x80000000;
1275
    } else {
1276
      T0 = res;
1277
      T1 = res >> 32;
1278
    }
1279
    FORCE_RET();
1280
}
1281

    
1282
NEON_OP(addl_saturate_u64)
1283
{
1284
    uint64_t src1;
1285
    uint64_t src2;
1286
    uint64_t res;
1287

    
1288
    src1 = T0 + ((uint64_t)T1 << 32);
1289
    src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);
1290
    res = src1 + src2;
1291
    if (res < src1) {
1292
        env->QF = 1;
1293
        T0 = 0xffffffff;
1294
        T1 = 0xffffffff;
1295
    } else {
1296
      T0 = res;
1297
      T1 = res >> 32;
1298
    }
1299
    FORCE_RET();
1300
}
1301

    
1302
NEON_OP(subl_saturate_s64)
1303
{
1304
    uint64_t src1;
1305
    uint64_t src2;
1306
    uint64_t res;
1307

    
1308
    src1 = T0 + ((uint64_t)T1 << 32);
1309
    src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);
1310
    res = src1 - src2;
1311
    if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {
1312
        env->QF = 1;
1313
        T0 = ~(int64_t)src1 >> 63;
1314
        T1 = T0 ^ 0x80000000;
1315
    } else {
1316
      T0 = res;
1317
      T1 = res >> 32;
1318
    }
1319
    FORCE_RET();
1320
}
1321

    
1322
NEON_OP(subl_saturate_u64)
1323
{
1324
    uint64_t src1;
1325
    uint64_t src2;
1326
    uint64_t res;
1327

    
1328
    src1 = T0 + ((uint64_t)T1 << 32);
1329
    src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);
1330
    if (src1 < src2) {
1331
        env->QF = 1;
1332
        T0 = 0;
1333
        T1 = 0;
1334
    } else {
1335
      res = src1 - src2;
1336
      T0 = res;
1337
      T1 = res >> 32;
1338
    }
1339
    FORCE_RET();
1340
}
1341

    
1342
NEON_OP(negl_u16)
1343
{
1344
    uint32_t tmp;
1345
    tmp = T0 >> 16;
1346
    tmp = -tmp;
1347
    T0 = (-T0 & 0xffff) | (tmp << 16);
1348
    tmp = T1 >> 16;
1349
    tmp = -tmp;
1350
    T1 = (-T1 & 0xffff) | (tmp << 16);
1351
    FORCE_RET();
1352
}
1353

    
1354
NEON_OP(negl_u32)
1355
{
1356
    T0 = -T0;
1357
    T1 = -T1;
1358
    FORCE_RET();
1359
}
1360

    
1361
NEON_OP(negl_u64)
1362
{
1363
    uint64_t val;
1364

    
1365
    val = T0 | ((uint64_t)T1 << 32);
1366
    val = -val;
1367
    T0 = val;
1368
    T1 = val >> 32;
1369
    FORCE_RET();
1370
}
1371

    
1372
/* Scalar operations.  */
1373
NEON_OP(dup_low16)
1374
{
1375
    T0 = (T0 & 0xffff) | (T0 << 16);
1376
    FORCE_RET();
1377
}
1378

    
1379
NEON_OP(dup_high16)
1380
{
1381
    T0 = (T0 >> 16) | (T0 & 0xffff0000);
1382
    FORCE_RET();
1383
}
1384

    
1385
/* Helper for VEXT */
1386
NEON_OP(extract)
1387
{
1388
    int shift = PARAM1;
1389
    T0 = (T0 >> shift) | (T1 << (32 - shift));
1390
    FORCE_RET();
1391
}
1392

    
1393
/* Pairwise add long.  Named type is source type.  */
1394
NEON_OP(paddl_s8)
1395
{
1396
    int8_t src1;
1397
    int8_t src2;
1398
    uint16_t result;
1399
    src1 = T0 >> 24;
1400
    src2 = T0 >> 16;
1401
    result = (uint16_t)src1 + src2;
1402
    src1 = T0 >> 8;
1403
    src2 = T0;
1404
    T0 = (uint16_t)((uint16_t)src1 + src2) | ((uint32_t)result << 16);
1405
    FORCE_RET();
1406
}
1407

    
1408
NEON_OP(paddl_u8)
1409
{
1410
    uint8_t src1;
1411
    uint8_t src2;
1412
    uint16_t result;
1413
    src1 = T0 >> 24;
1414
    src2 = T0 >> 16;
1415
    result = (uint16_t)src1 + src2;
1416
    src1 = T0 >> 8;
1417
    src2 = T0;
1418
    T0 = (uint16_t)((uint16_t)src1 + src2) | ((uint32_t)result << 16);
1419
    FORCE_RET();
1420
}
1421

    
1422
NEON_OP(paddl_s16)
1423
{
1424
    T0 = (uint32_t)(int16_t)T0 + (uint32_t)(int16_t)(T0 >> 16);
1425
    FORCE_RET();
1426
}
1427

    
1428
NEON_OP(paddl_u16)
1429
{
1430
    T0 = (uint32_t)(uint16_t)T0 + (uint32_t)(uint16_t)(T0 >> 16);
1431
    FORCE_RET();
1432
}
1433

    
1434
NEON_OP(paddl_s32)
1435
{
1436
    int64_t tmp;
1437
    tmp = (int64_t)(int32_t)T0 + (int64_t)(int32_t)T1;
1438
    T0 = tmp;
1439
    T1 = tmp >> 32;
1440
    FORCE_RET();
1441
}
1442

    
1443
NEON_OP(paddl_u32)
1444
{
1445
    uint64_t tmp;
1446
    tmp = (uint64_t)T0 + (uint64_t)T1;
1447
    T0 = tmp;
1448
    T1 = tmp >> 32;
1449
    FORCE_RET();
1450
}
1451

    
1452
/* Count Leading Sign/Zero Bits.  */
1453
/* Count the leading zero bits of an 8-bit value; returns 8 for zero.  */
static inline int do_clz8(uint8_t x)
{
    int zeros = 8;

    /* Each shift that leaves the value non-empty removes one
       significant bit from the count of leading zeros.  */
    while (x != 0) {
        x >>= 1;
        zeros--;
    }
    return zeros;
}
1460

    
1461
/* Count the leading zero bits of a 16-bit value; returns 16 for zero.  */
static inline int do_clz16(uint16_t x)
{
    int zeros = 16;

    while (x != 0) {
        x >>= 1;
        zeros--;
    }
    return zeros;
}
1468

    
1469
NEON_OP(clz_u8)
1470
{
1471
    uint32_t result;
1472
    uint32_t tmp;
1473

    
1474
    tmp = T0;
1475
    result = do_clz8(tmp);
1476
    result |= do_clz8(tmp >> 8) << 8;
1477
    result |= do_clz8(tmp >> 16) << 16;
1478
    result |= do_clz8(tmp >> 24) << 24;
1479
    T0 = result;
1480
    FORCE_RET();
1481
}
1482

    
1483
NEON_OP(clz_u16)
1484
{
1485
    uint32_t result;
1486
    uint32_t tmp;
1487
    tmp = T0;
1488
    result = do_clz16(tmp);
1489
    result |= do_clz16(tmp >> 16) << 16;
1490
    T0 = result;
1491
    FORCE_RET();
1492
}
1493

    
1494
NEON_OP(cls_s8)
1495
{
1496
    uint32_t result;
1497
    int8_t tmp;
1498
    tmp = T0;
1499
    result = do_clz8((tmp < 0) ? ~tmp : tmp) - 1;
1500
    tmp = T0 >> 8;
1501
    result |= (do_clz8((tmp < 0) ? ~tmp : tmp) - 1) << 8;
1502
    tmp = T0 >> 16;
1503
    result |= (do_clz8((tmp < 0) ? ~tmp : tmp) - 1) << 16;
1504
    tmp = T0 >> 24;
1505
    result |= (do_clz8((tmp < 0) ? ~tmp : tmp) - 1) << 24;
1506
    T0 = result;
1507
    FORCE_RET();
1508
}
1509

    
1510
NEON_OP(cls_s16)
1511
{
1512
    uint32_t result;
1513
    int16_t tmp;
1514
    tmp = T0;
1515
    result = do_clz16((tmp < 0) ? ~tmp : tmp) - 1;
1516
    tmp = T0 >> 16;
1517
    result |= (do_clz16((tmp < 0) ? ~tmp : tmp) - 1) << 16;
1518
    T0 = result;
1519
    FORCE_RET();
1520
}
1521

    
1522
NEON_OP(cls_s32)
1523
{
1524
    int count;
1525
    if ((int32_t)T0 < 0)
1526
        T0 = ~T0;
1527
    for (count = 32; T0 > 0; count--)
1528
        T0 = T0 >> 1;
1529
    T0 = count - 1;
1530
    FORCE_RET();
1531
}
1532

    
1533
/* Bit count.  */
1534
NEON_OP(cnt_u8)
1535
{
1536
    T0 = (T0 & 0x55555555) + ((T0 >>  1) & 0x55555555);
1537
    T0 = (T0 & 0x33333333) + ((T0 >>  2) & 0x33333333);
1538
    T0 = (T0 & 0x0f0f0f0f) + ((T0 >>  4) & 0x0f0f0f0f);
1539
    FORCE_RET();
1540
}
1541

    
1542
/* Saturating absolute value and negation.  */
1543
/* ??? Make these use NEON_VOP1 */
1544
#define DO_QABS8(x) do { \
1545
    if (x == (int8_t)0x80) { \
1546
        x = 0x7f; \
1547
        env->QF = 1; \
1548
    } else if (x < 0) { \
1549
        x = -x; \
1550
    }} while (0)
1551
NEON_OP(qabs_s8)
1552
{
1553
    neon_s8 vec;
1554
    NEON_UNPACK(neon_s8, vec, T0);
1555
    DO_QABS8(vec.v1);
1556
    DO_QABS8(vec.v2);
1557
    DO_QABS8(vec.v3);
1558
    DO_QABS8(vec.v4);
1559
    NEON_PACK(neon_s8, T0, vec);
1560
    FORCE_RET();
1561
}
1562
#undef DO_QABS8
1563

    
1564
#define DO_QNEG8(x) do { \
1565
    if (x == (int8_t)0x80) { \
1566
        x = 0x7f; \
1567
        env->QF = 1; \
1568
    } else { \
1569
        x = -x; \
1570
    }} while (0)
1571
NEON_OP(qneg_s8)
1572
{
1573
    neon_s8 vec;
1574
    NEON_UNPACK(neon_s8, vec, T0);
1575
    DO_QNEG8(vec.v1);
1576
    DO_QNEG8(vec.v2);
1577
    DO_QNEG8(vec.v3);
1578
    DO_QNEG8(vec.v4);
1579
    NEON_PACK(neon_s8, T0, vec);
1580
    FORCE_RET();
1581
}
1582
#undef DO_QNEG8
1583

    
1584
#define DO_QABS16(x) do { \
1585
    if (x == (int16_t)0x8000) { \
1586
        x = 0x7fff; \
1587
        env->QF = 1; \
1588
    } else if (x < 0) { \
1589
        x = -x; \
1590
    }} while (0)
1591
NEON_OP(qabs_s16)
1592
{
1593
    neon_s16 vec;
1594
    NEON_UNPACK(neon_s16, vec, T0);
1595
    DO_QABS16(vec.v1);
1596
    DO_QABS16(vec.v2);
1597
    NEON_PACK(neon_s16, T0, vec);
1598
    FORCE_RET();
1599
}
1600
#undef DO_QABS16
1601

    
1602
#define DO_QNEG16(x) do { \
1603
    if (x == (int16_t)0x8000) { \
1604
        x = 0x7fff; \
1605
        env->QF = 1; \
1606
    } else { \
1607
        x = -x; \
1608
    }} while (0)
1609
NEON_OP(qneg_s16)
1610
{
1611
    neon_s16 vec;
1612
    NEON_UNPACK(neon_s16, vec, T0);
1613
    DO_QNEG16(vec.v1);
1614
    DO_QNEG16(vec.v2);
1615
    NEON_PACK(neon_s16, T0, vec);
1616
    FORCE_RET();
1617
}
1618
#undef DO_QNEG16
1619

    
1620
NEON_OP(qabs_s32)
1621
{
1622
    if (T0 == 0x80000000) {
1623
        T0 = 0x7fffffff;
1624
        env->QF = 1;
1625
    } else if ((int32_t)T0 < 0) {
1626
        T0 = -T0;
1627
    }
1628
    FORCE_RET();
1629
}
1630

    
1631
NEON_OP(qneg_s32)
1632
{
1633
    if (T0 == 0x80000000) {
1634
        T0 = 0x7fffffff;
1635
        env->QF = 1;
1636
    } else {
1637
        T0 = -T0;
1638
    }
1639
    FORCE_RET();
1640
}
1641

    
1642
/* Unary operations */
1643
#define NEON_FN(dest, src, dummy) dest = (src < 0) ? -src : src
1644
NEON_VOP1(abs_s8, neon_s8, 4)
1645
NEON_VOP1(abs_s16, neon_s16, 2)
1646
NEON_OP(abs_s32)
1647
{
1648
    if ((int32_t)T0 < 0)
1649
        T0 = -T0;
1650
    FORCE_RET();
1651
}
1652
#undef NEON_FN
1653

    
1654
/* Transpose.  Argument order is rather strange to avoid special casing
1655
   the translation code.
1656
   On input T0 = rm, T1 = rd.  On output T0 = rd, T1 = rm  */
1657
NEON_OP(trn_u8)
1658
{
1659
    uint32_t rd;
1660
    uint32_t rm;
1661
    rd = ((T0 & 0x00ff00ff) << 8) | (T1 & 0x00ff00ff);
1662
    rm = ((T1 & 0xff00ff00) >> 8) | (T0 & 0xff00ff00);
1663
    T0 = rd;
1664
    T1 = rm;
1665
    FORCE_RET();
1666
}
1667

    
1668
NEON_OP(trn_u16)
1669
{
1670
    uint32_t rd;
1671
    uint32_t rm;
1672
    rd = (T0 << 16) | (T1 & 0xffff);
1673
    rm = (T1 >> 16) | (T0 & 0xffff0000);
1674
    T0 = rd;
1675
    T1 = rm;
1676
    FORCE_RET();
1677
}
1678

    
1679
/* Worker routines for zip and unzip.  */
1680
NEON_OP(unzip_u8)
1681
{
1682
    uint32_t rd;
1683
    uint32_t rm;
1684
    rd = (T0 & 0xff) | ((T0 >> 8) & 0xff00)
1685
         | ((T1 << 16) & 0xff0000) | ((T1 << 8) & 0xff000000);
1686
    rm = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00)
1687
         | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000);
1688
    T0 = rd;
1689
    T1 = rm;
1690
    FORCE_RET();
1691
}
1692

    
1693
NEON_OP(zip_u8)
1694
{
1695
    uint32_t rd;
1696
    uint32_t rm;
1697
    rd = (T0 & 0xff) | ((T1 << 8) & 0xff00)
1698
         | ((T0 << 16) & 0xff0000) | ((T1 << 24) & 0xff000000);
1699
    rm = ((T0 >> 16) & 0xff) | ((T1 >> 8) & 0xff00)
1700
         | ((T0 >> 8) & 0xff0000) | (T1 & 0xff000000);
1701
    T0 = rd;
1702
    T1 = rm;
1703
    FORCE_RET();
1704
}
1705

    
1706
NEON_OP(zip_u16)
1707
{
1708
    uint32_t tmp;
1709

    
1710
    tmp = (T0 & 0xffff) | (T1 << 16);
1711
    T1 = (T1 & 0xffff0000) | (T0 >> 16);
1712
    T0 = tmp;
1713
    FORCE_RET();
1714
}
1715

    
1716
/* Table lookup.  This accesses the register file directly.  */
1717
NEON_OP(tbl)
1718
{
1719
    helper_neon_tbl(PARAM1, PARAM2);
1720
}
1721

    
1722
NEON_OP(dup_u8)
1723
{
1724
    T0 = (T0 >> PARAM1) & 0xff;
1725
    T0 |= T0 << 8;
1726
    T0 |= T0 << 16;
1727
    FORCE_RET();
1728
}
1729

    
1730
/* Helpers for element load/store.  */
1731
NEON_OP(insert_elt)
1732
{
1733
    int shift = PARAM1;
1734
    uint32_t mask = PARAM2;
1735
    T2 = (T2 & mask) | (T0 << shift);
1736
    FORCE_RET();
1737
}
1738

    
1739
NEON_OP(extract_elt)
1740
{
1741
    int shift = PARAM1;
1742
    uint32_t mask = PARAM2;
1743
    T0 = (T2 & mask) >> shift;
1744
    FORCE_RET();
1745
}