root / target-arm / op_neon.h @ 8f8e3aa4

1
/*
2
 * ARM NEON vector operations.
3
 *
4
 * Copyright (c) 2007 CodeSourcery.
5
 * Written by Paul Brook
6
 *
7
 * This code is licensed under the GPL.
8
 */
9
/* Note that for NEON an "l" prefix means it is a wide operation, unlike
10
   scalar ARM ops where it means a word-size operation.  */
11

    
12
#define SIGNBIT (uint32_t)0x80000000
13
/* ??? NEON ops should probably have their own float status.  */
14
#define NFS &env->vfp.fp_status
15
#define NEON_OP(name) void OPPROTO op_neon_##name (void)
16

    
17
/* Helper routines to perform bitwise copies between float and int.  */
18
static inline float32 vfp_itos(uint32_t i)
19
{
20
    union {
21
        uint32_t i;
22
        float32 s;
23
    } v;
24

    
25
    v.i = i;
26
    return v.s;
27
}
28

    
29
static inline uint32_t vfp_stoi(float32 s)
30
{
31
    union {
32
        uint32_t i;
33
        float32 s;
34
    } v;
35

    
36
    v.s = s;
37
    return v.i;
38
}
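
/* Illustrative note (not from the original source): the unions above
   reinterpret the bit pattern without any numeric conversion, so the round
   trip vfp_stoi(vfp_itos(0x3f800000)) returns 0x3f800000, the IEEE-754
   encoding of 1.0f, whatever the underlying float32 representation is.  */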
39

    
40
NEON_OP(getreg_T0)
41
{
42
    T0 = *(uint32_t *)((char *) env + PARAM1);
43
}
44

    
45
NEON_OP(getreg_T1)
46
{
47
    T1 = *(uint32_t *)((char *) env + PARAM1);
48
}
49

    
50
NEON_OP(setreg_T0)
51
{
52
    *(uint32_t *)((char *) env + PARAM1) = T0;
53
}
54

    
55
NEON_OP(setreg_T1)
56
{
57
    *(uint32_t *)((char *) env + PARAM1) = T1;
58
}
59

    
60
#define NEON_TYPE1(name, type) \
61
typedef struct \
62
{ \
63
    type v1; \
64
} neon_##name;
65
#ifdef WORDS_BIGENDIAN
66
#define NEON_TYPE2(name, type) \
67
typedef struct \
68
{ \
69
    type v2; \
70
    type v1; \
71
} neon_##name;
72
#define NEON_TYPE4(name, type) \
73
typedef struct \
74
{ \
75
    type v4; \
76
    type v3; \
77
    type v2; \
78
    type v1; \
79
} neon_##name;
80
#else
81
#define NEON_TYPE2(name, type) \
82
typedef struct \
83
{ \
84
    type v1; \
85
    type v2; \
86
} neon_##name;
87
#define NEON_TYPE4(name, type) \
88
typedef struct \
89
{ \
90
    type v1; \
91
    type v2; \
92
    type v3; \
93
    type v4; \
94
} neon_##name;
95
#endif
96

    
97
NEON_TYPE4(s8, int8_t)
98
NEON_TYPE4(u8, uint8_t)
99
NEON_TYPE2(s16, int16_t)
100
NEON_TYPE2(u16, uint16_t)
101
NEON_TYPE1(s32, int32_t)
102
NEON_TYPE1(u32, uint32_t)
103
#undef NEON_TYPE4
104
#undef NEON_TYPE2
105
#undef NEON_TYPE1
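
/* Illustrative note: the two struct orderings above exist so that, once
   unioned with a uint32_t (see NEON_UNPACK/NEON_PACK below), v1 always maps
   to the least significant lane of the packed word on both big- and
   little-endian hosts; e.g. for neon_u8 and the value 0x44332211,
   v1 == 0x11 and v4 == 0x44.  */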
106

    
107
/* Copy from a uint32_t to a vector structure type.  */
108
#define NEON_UNPACK(vtype, dest, val) do { \
109
    union { \
110
        vtype v; \
111
        uint32_t i; \
112
    } conv_u; \
113
    conv_u.i = (val); \
114
    dest = conv_u.v; \
115
    } while(0)
116

    
117
/* Copy from a vector structure type to a uint32_t.  */
118
#define NEON_PACK(vtype, dest, val) do { \
119
    union { \
120
        vtype v; \
121
        uint32_t i; \
122
    } conv_u; \
123
    conv_u.v = (val); \
124
    dest = conv_u.i; \
125
    } while(0)
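
/* A minimal illustrative sketch (not part of the original helpers): the
   pack/unpack pair is used like this to apply a per-lane operation to a
   packed register value:

       neon_u8 v;
       NEON_UNPACK(neon_u8, v, T0);
       v.v1 ^= 0xff;  v.v2 ^= 0xff;  v.v3 ^= 0xff;  v.v4 ^= 0xff;
       NEON_PACK(neon_u8, T0, v);

   which is exactly the pattern generated by NEON_VOP/NEON_POP below.  */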
126

    
127
#define NEON_DO1 \
128
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
129
#define NEON_DO2 \
130
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
131
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
132
#define NEON_DO4 \
133
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
134
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
135
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
136
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);
137

    
138
#define NEON_VOP(name, vtype, n) \
139
NEON_OP(name) \
140
{ \
141
    vtype vsrc1; \
142
    vtype vsrc2; \
143
    vtype vdest; \
144
    NEON_UNPACK(vtype, vsrc1, T0); \
145
    NEON_UNPACK(vtype, vsrc2, T1); \
146
    NEON_DO##n; \
147
    NEON_PACK(vtype, T0, vdest); \
148
    FORCE_RET(); \
149
}
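
/* Approximate expansion (illustrative sketch, assuming NEON_FN is defined as
   dest = (src1 + src2) >> 1, as done for the halving adds further down):
   NEON_VOP(hadd_u8, neon_u8, 4) then produces roughly

       void OPPROTO op_neon_hadd_u8 (void)
       {
           neon_u8 vsrc1, vsrc2, vdest;
           NEON_UNPACK(neon_u8, vsrc1, T0);
           NEON_UNPACK(neon_u8, vsrc2, T1);
           vdest.v1 = (vsrc1.v1 + vsrc2.v1) >> 1;
           ... and likewise for lanes v2, v3 and v4 ...
           NEON_PACK(neon_u8, T0, vdest);
           FORCE_RET();
       }
*/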
150

    
151
#define NEON_VOP1(name, vtype, n) \
152
NEON_OP(name) \
153
{ \
154
    vtype vsrc1; \
155
    vtype vdest; \
156
    NEON_UNPACK(vtype, vsrc1, T0); \
157
    NEON_DO##n; \
158
    NEON_PACK(vtype, T0, vdest); \
159
    FORCE_RET(); \
160
}
161

    
162
/* Pairwise operations.  */
163
/* For 32-bit elements each segment only contains a single element, so
164
   the elementwise and pairwise operations are the same.  */
165
#define NEON_PDO2 \
166
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
167
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
168
#define NEON_PDO4 \
169
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
170
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
171
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
172
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \
173

    
174
#define NEON_POP(name, vtype, n) \
175
NEON_OP(name) \
176
{ \
177
    vtype vsrc1; \
178
    vtype vsrc2; \
179
    vtype vdest; \
180
    NEON_UNPACK(vtype, vsrc1, T0); \
181
    NEON_UNPACK(vtype, vsrc2, T1); \
182
    NEON_PDO##n; \
183
    NEON_PACK(vtype, T0, vdest); \
184
    FORCE_RET(); \
185
}
186

    
187
#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
188
NEON_VOP(hadd_s8, neon_s8, 4)
189
NEON_VOP(hadd_u8, neon_u8, 4)
190
NEON_VOP(hadd_s16, neon_s16, 2)
191
NEON_VOP(hadd_u16, neon_u16, 2)
192
#undef NEON_FN
193

    
194
NEON_OP(hadd_s32)
195
{
196
    int32_t src1 = T0;
197
    int32_t src2 = T1;
198
    int32_t dest;
199

    
200
    dest = (src1 >> 1) + (src2 >> 1);
201
    if (src1 & src2 & 1)
202
        dest++;
203
    T0 = dest;
204
    FORCE_RET();
205
}
206

    
207
NEON_OP(hadd_u32)
208
{
209
    uint32_t src1 = T0;
210
    uint32_t src2 = T1;
211
    uint32_t dest;
212

    
213
    dest = (src1 >> 1) + (src2 >> 1);
214
    if (src1 & src2 & 1)
215
        dest++;
216
    T0 = dest;
217
    FORCE_RET();
218
}
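
/* A minimal standalone sketch (the helper name is illustrative only and is
   not used elsewhere in this file): the same carry-preserving trick written
   as an ordinary function, averaging two words without overflowing the
   intermediate sum.  */
static inline uint32_t neon_example_hadd_u32(uint32_t a, uint32_t b)
{
    /* (a + b) >> 1 computed as (a >> 1) + (b >> 1) plus the carry that is
       produced when both low bits are set.  */
    return (a >> 1) + (b >> 1) + (a & b & 1);
}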
219

    
220
#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
221
NEON_VOP(rhadd_s8, neon_s8, 4)
222
NEON_VOP(rhadd_u8, neon_u8, 4)
223
NEON_VOP(rhadd_s16, neon_s16, 2)
224
NEON_VOP(rhadd_u16, neon_u16, 2)
225
#undef NEON_FN
226

    
227
NEON_OP(rhadd_s32)
228
{
229
    int32_t src1 = T0;
230
    int32_t src2 = T1;
231
    int32_t dest;
232

    
233
    dest = (src1 >> 1) + (src2 >> 1);
234
    if ((src1 | src2) & 1)
235
        dest++;
236
    T0 = dest;
237
    FORCE_RET();
238
}
239

    
240
NEON_OP(rhadd_u32)
241
{
242
    uint32_t src1 = T0;
243
    uint32_t src2 = T1;
244
    uint32_t dest;
245

    
246
    dest = (src1 >> 1) + (src2 >> 1);
247
    if ((src1 | src2) & 1)
248
        dest++;
249
    T0 = dest;
250
    FORCE_RET();
251
}
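
/* Note: the rounding variants use ((src1 | src2) & 1) because
   (a + b + 1) >> 1 == (a >> 1) + (b >> 1) + ((a | b) & 1) for any a and b,
   again without needing a wider intermediate.  */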
252

    
253
#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
254
NEON_VOP(hsub_s8, neon_s8, 4)
255
NEON_VOP(hsub_u8, neon_u8, 4)
256
NEON_VOP(hsub_s16, neon_s16, 2)
257
NEON_VOP(hsub_u16, neon_u16, 2)
258
#undef NEON_FN
259

    
260
NEON_OP(hsub_s32)
261
{
262
    int32_t src1 = T0;
263
    int32_t src2 = T1;
264
    int32_t dest;
265

    
266
    dest = (src1 >> 1) - (src2 >> 1);
267
    if ((~src1) & src2 & 1)
268
        dest--;
269
    T0 = dest;
270
    FORCE_RET();
271
}
272

    
273
NEON_OP(hsub_u32)
274
{
275
    uint32_t src1 = T0;
276
    uint32_t src2 = T1;
277
    uint32_t dest;
278

    
279
    dest = (src1 >> 1) - (src2 >> 1);
280
    if ((~src1) & src2 & 1)
281
        dest--;
282
    T0 = dest;
283
    FORCE_RET();
284
}
285

    
286
#define NEON_USAT(dest, src1, src2, type) do { \
287
    uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
288
    if (tmp != (type)tmp) { \
289
        env->QF = 1; \
290
        dest = ~0; \
291
    } else { \
292
        dest = tmp; \
293
    }} while(0)
294
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
295
NEON_VOP(qadd_u8, neon_u8, 4)
296
#undef NEON_FN
297
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
298
NEON_VOP(qadd_u16, neon_u16, 2)
299
#undef NEON_FN
300
#undef NEON_USAT
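
/* Worked example (illustrative): the truncation test above computes the sum
   in 32 bits and compares it with its narrowed value, so for qadd_u8 with
   src1 = 0xf0 and src2 = 0x20, tmp = 0x110, (uint8_t)tmp = 0x10 != tmp, and
   the lane saturates to 0xff with QF set.  */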
301

    
302
#define NEON_SSAT(dest, src1, src2, type) do { \
303
    int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
304
    if (tmp != (type)tmp) { \
305
        env->QF = 1; \
306
        if (src2 > 0) { \
307
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
308
        } else { \
309
            tmp = 1 << (sizeof(type) * 8 - 1); \
310
        } \
311
    } \
312
    dest = tmp; \
313
    } while(0)
314
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
315
NEON_VOP(qadd_s8, neon_s8, 4)
316
#undef NEON_FN
317
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
318
NEON_VOP(qadd_s16, neon_s16, 2)
319
#undef NEON_FN
320
#undef NEON_SSAT
321

    
322
#define NEON_USAT(dest, src1, src2, type) do { \
323
    uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
324
    if (tmp != (type)tmp) { \
325
        env->QF = 1; \
326
        dest = 0; \
327
    } else { \
328
        dest = tmp; \
329
    }} while(0)
330
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
331
NEON_VOP(qsub_u8, neon_u8, 4)
332
#undef NEON_FN
333
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
334
NEON_VOP(qsub_u16, neon_u16, 2)
335
#undef NEON_FN
336
#undef NEON_USAT
337

    
338
#define NEON_SSAT(dest, src1, src2, type) do { \
339
    int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
340
    if (tmp != (type)tmp) { \
341
        env->QF = 1; \
342
        if (src2 < 0) { \
343
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
344
        } else { \
345
            tmp = 1 << (sizeof(type) * 8 - 1); \
346
        } \
347
    } \
348
    dest = tmp; \
349
    } while(0)
350
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
351
NEON_VOP(qsub_s8, neon_s8, 4)
352
#undef NEON_FN
353
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
354
NEON_VOP(qsub_s16, neon_s16, 2)
355
#undef NEON_FN
356
#undef NEON_SSAT
357

    
358
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? ~0 : 0
359
NEON_VOP(cgt_s8, neon_s8, 4)
360
NEON_VOP(cgt_u8, neon_u8, 4)
361
NEON_VOP(cgt_s16, neon_s16, 2)
362
NEON_VOP(cgt_u16, neon_u16, 2)
363
NEON_VOP(cgt_s32, neon_s32, 1)
364
NEON_VOP(cgt_u32, neon_u32, 1)
365
#undef NEON_FN
366

    
367
#define NEON_FN(dest, src1, src2) dest = (src1 >= src2) ? ~0 : 0
368
NEON_VOP(cge_s8, neon_s8, 4)
369
NEON_VOP(cge_u8, neon_u8, 4)
370
NEON_VOP(cge_s16, neon_s16, 2)
371
NEON_VOP(cge_u16, neon_u16, 2)
372
NEON_VOP(cge_s32, neon_s32, 1)
373
NEON_VOP(cge_u32, neon_u32, 1)
374
#undef NEON_FN
375

    
376
#define NEON_FN(dest, src1, src2) do { \
377
    int8_t tmp; \
378
    tmp = (int8_t)src2; \
379
    if (tmp < 0) { \
380
        dest = src1 >> -tmp; \
381
    } else { \
382
        dest = src1 << tmp; \
383
    }} while (0)
384
NEON_VOP(shl_s8, neon_s8, 4)
385
NEON_VOP(shl_u8, neon_u8, 4)
386
NEON_VOP(shl_s16, neon_s16, 2)
387
NEON_VOP(shl_u16, neon_u16, 2)
388
NEON_VOP(shl_s32, neon_s32, 1)
389
NEON_VOP(shl_u32, neon_u32, 1)
390
#undef NEON_FN
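
/* Note (follows the VSHL-by-register semantics): the per-lane shift count is
   taken as a signed byte, and a negative count shifts right instead, e.g. a
   count of -3 turns src1 << count into src1 >> 3; the 64-bit shl_u64/shl_s64
   variants below take their count from vfp.scratch[0].  */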
391

    
392
NEON_OP(shl_u64)
393
{
394
    int8_t shift = env->vfp.scratch[0];
395
    uint64_t val = T0 | ((uint64_t)T1 << 32);
396
    if (shift < 0) {
397
        val >>= -shift;
398
    } else {
399
        val <<= shift;
400
    }
401
    T0 = val;
402
    T1 = val >> 32;
403
    FORCE_RET();
404
}
405

    
406
NEON_OP(shl_s64)
407
{
408
    int8_t shift = env->vfp.scratch[0];
409
    int64_t val = T0 | ((uint64_t)T1 << 32);
410
    if (shift < 0) {
411
        val >>= -shift;
412
    } else {
413
        val <<= shift;
414
    }
415
    T0 = val;
416
    T1 = val >> 32;
417
    FORCE_RET();
418
}
419

    
420
#define NEON_FN(dest, src1, src2) do { \
421
    int8_t tmp; \
422
    tmp = (int8_t)src1; \
423
    if (tmp < 0) { \
424
        dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \
425
    } else { \
426
        dest = src2 << tmp; \
427
    }} while (0)
428

    
429
NEON_VOP(rshl_s8, neon_s8, 4)
430
NEON_VOP(rshl_u8, neon_u8, 4)
431
NEON_VOP(rshl_s16, neon_s16, 2)
432
NEON_VOP(rshl_u16, neon_u16, 2)
433
NEON_VOP(rshl_s32, neon_s32, 1)
434
NEON_VOP(rshl_u32, neon_u32, 1)
435
#undef NEON_FN
436

    
437
NEON_OP(rshl_u64)
438
{
439
    int8_t shift = env->vfp.scratch[0];
440
    uint64_t val = T0 | ((uint64_t)T1 << 32);
441
    if (shift < 0) {
442
        val = (val + ((uint64_t)1 << (-1 - shift))) >> -shift;
443
        val >>= -shift;
444
    } else {
445
        val <<= shift;
446
    }
447
    T0 = val;
448
    T1 = val >> 32;
449
    FORCE_RET();
450
}
451

    
452
NEON_OP(rshl_s64)
453
{
454
    int8_t shift = env->vfp.scratch[0];
455
    int64_t val = T0 | ((uint64_t)T1 << 32);
456
    if (shift < 0) {
457
        val = (val + ((int64_t)1 << (-1 - shift))) >> -shift;
458
    } else {
459
        val <<= shift;
460
    }
461
    T0 = val;
462
    T1 = val >> 32;
463
    FORCE_RET();
464
}
465

    
466
#define NEON_FN(dest, src1, src2) do { \
467
    int8_t tmp; \
468
    tmp = (int8_t)src1; \
469
    if (tmp < 0) { \
470
        dest = src2 >> -tmp; \
471
    } else { \
472
        dest = src2 << tmp; \
473
        if ((dest >> tmp) != src2) { \
474
            env->QF = 1; \
475
            dest = ~0; \
476
        } \
477
    }} while (0)
478
NEON_VOP(qshl_s8, neon_s8, 4)
479
NEON_VOP(qshl_s16, neon_s16, 2)
480
NEON_VOP(qshl_s32, neon_s32, 1)
481
#undef NEON_FN
482

    
483
NEON_OP(qshl_s64)
484
{
485
    int8_t shift = env->vfp.scratch[0];
486
    int64_t val = T0 | ((uint64_t)T1 << 32);
487
    if (shift < 0) {
488
        val >>= -shift;
489
    } else {
490
        int64_t tmp = val;
491
        val <<= shift;
492
        if ((val >> shift) != tmp) {
493
            env->QF = 1;
494
            val = (tmp >> 63) ^ 0x7fffffffffffffffULL;
495
        }
496
    }
497
    T0 = val;
498
    T1 = val >> 32;
499
    FORCE_RET();
500
}
501

    
502
#define NEON_FN(dest, src1, src2) do { \
503
    int8_t tmp; \
504
    tmp = (int8_t)src1; \
505
    if (tmp < 0) { \
506
        dest = src2 >> -tmp; \
507
    } else { \
508
        dest = src2 << tmp; \
509
        if ((dest >> tmp) != src2) { \
510
            env->QF = 1; \
511
            dest = ~0; \
512
        } \
513
    }} while (0)
514
NEON_VOP(qshl_u8, neon_u8, 4)
515
NEON_VOP(qshl_u16, neon_u16, 2)
516
NEON_VOP(qshl_u32, neon_u32, 1)
517
#undef NEON_FN
518

    
519
NEON_OP(qshl_u64)
520
{
521
    int8_t shift = env->vfp.scratch[0];
522
    uint64_t val = T0 | ((uint64_t)T1 << 32);
523
    if (shift < 0) {
524
        val >>= -shift;
525
    } else {
526
        uint64_t tmp = val;
527
        val <<= shift;
528
        if ((val >> shift) != tmp) {
529
            env->QF = 1;
530
            val = ~(uint64_t)0;
531
        }
532
    }
533
    T0 = val;
534
    T1 = val >> 32;
535
    FORCE_RET();
536
}
537

    
538
#define NEON_FN(dest, src1, src2) do { \
539
    int8_t tmp; \
540
    tmp = (int8_t)src1; \
541
    if (tmp < 0) { \
542
        dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \
543
    } else { \
544
        dest = src2 << tmp; \
545
        if ((dest >> tmp) != src2) { \
546
            env->QF = 1; \
            dest = ~0; \
547
        } \
548
    }} while (0)
549
NEON_VOP(qrshl_s8, neon_s8, 4)
550
NEON_VOP(qrshl_s16, neon_s16, 2)
551
NEON_VOP(qrshl_s32, neon_s32, 1)
552
#undef NEON_FN
553

    
554
#define NEON_FN(dest, src1, src2) do { \
555
    int8_t tmp; \
556
    tmp = (int8_t)src1; \
557
    if (tmp < 0) { \
558
        dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \
559
    } else { \
560
        dest = src2 << tmp; \
561
        if ((dest >> tmp) != src2) { \
562
            env->QF = 1; \
563
            dest = ~0; \
564
        } \
565
    }} while (0)
566
NEON_VOP(qrshl_u8, neon_u8, 4)
567
NEON_VOP(qrshl_u16, neon_u16, 2)
568
NEON_VOP(qrshl_u32, neon_u32, 1)
569
#undef NEON_FN
570

    
571
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
572
NEON_VOP(max_s8, neon_s8, 4)
573
NEON_VOP(max_u8, neon_u8, 4)
574
NEON_VOP(max_s16, neon_s16, 2)
575
NEON_VOP(max_u16, neon_u16, 2)
576
NEON_VOP(max_s32, neon_s32, 1)
577
NEON_VOP(max_u32, neon_u32, 1)
578
NEON_POP(pmax_s8, neon_s8, 4)
579
NEON_POP(pmax_u8, neon_u8, 4)
580
NEON_POP(pmax_s16, neon_s16, 2)
581
NEON_POP(pmax_u16, neon_u16, 2)
582
#undef NEON_FN
583

    
584
NEON_OP(max_f32)
585
{
586
    float32 f0 = vfp_itos(T0);
587
    float32 f1 = vfp_itos(T1);
588
    T0 = (float32_compare_quiet(f0, f1, NFS) == 1) ? T0 : T1;
589
    FORCE_RET();
590
}
591

    
592
#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
593
NEON_VOP(min_s8, neon_s8, 4)
594
NEON_VOP(min_u8, neon_u8, 4)
595
NEON_VOP(min_s16, neon_s16, 2)
596
NEON_VOP(min_u16, neon_u16, 2)
597
NEON_VOP(min_s32, neon_s32, 1)
598
NEON_VOP(min_u32, neon_u32, 1)
599
NEON_POP(pmin_s8, neon_s8, 4)
600
NEON_POP(pmin_u8, neon_u8, 4)
601
NEON_POP(pmin_s16, neon_s16, 2)
602
NEON_POP(pmin_u16, neon_u16, 2)
603
#undef NEON_FN
604

    
605
NEON_OP(min_f32)
606
{
607
    float32 f0 = vfp_itos(T0);
608
    float32 f1 = vfp_itos(T1);
609
    T0 = (float32_compare_quiet(f0, f1, NFS) == -1) ? T0 : T1;
610
    FORCE_RET();
611
}
612

    
613
#define NEON_FN(dest, src1, src2) \
614
    dest = (src1 > src2) ? (src1 - src2) : (src2 - src1)
615
NEON_VOP(abd_s8, neon_s8, 4)
616
NEON_VOP(abd_u8, neon_u8, 4)
617
NEON_VOP(abd_s16, neon_s16, 2)
618
NEON_VOP(abd_u16, neon_u16, 2)
619
NEON_VOP(abd_s32, neon_s32, 1)
620
NEON_VOP(abd_u32, neon_u32, 1)
621
#undef NEON_FN
622

    
623
NEON_OP(abd_f32)
624
{
625
    float32 f0 = vfp_itos(T0);
626
    float32 f1 = vfp_itos(T1);
627
    T0 = vfp_stoi((float32_compare_quiet(f0, f1, NFS) == 1)
628
                  ? float32_sub(f0, f1, NFS)
629
                  : float32_sub(f1, f0, NFS));
630
    FORCE_RET();
631
}
632

    
633
#define NEON_FN(dest, src1, src2) dest = src1 + src2
634
NEON_VOP(add_u8, neon_u8, 4)
635
NEON_VOP(add_u16, neon_u16, 2)
636
NEON_POP(padd_u8, neon_u8, 4)
637
NEON_POP(padd_u16, neon_u16, 2)
638
#undef NEON_FN
639

    
640
NEON_OP(add_f32)
641
{
642
    T0 = vfp_stoi(float32_add(vfp_itos(T0), vfp_itos(T1), NFS));
643
    FORCE_RET();
644
}
645

    
646
#define NEON_FN(dest, src1, src2) dest = src1 - src2
647
NEON_VOP(sub_u8, neon_u8, 4)
648
NEON_VOP(sub_u16, neon_u16, 2)
649
#undef NEON_FN
650

    
651
NEON_OP(sub_f32)
652
{
653
    T0 = vfp_stoi(float32_sub(vfp_itos(T0), vfp_itos(T1), NFS));
654
    FORCE_RET();
655
}
656

    
657
#define NEON_FN(dest, src1, src2) dest = src2 - src1
658
NEON_VOP(rsb_u8, neon_u8, 4)
659
NEON_VOP(rsb_u16, neon_u16, 2)
660
#undef NEON_FN
661

    
662
NEON_OP(rsb_f32)
663
{
664
    T0 = vfp_stoi(float32_sub(vfp_itos(T1), vfp_itos(T0), NFS));
665
    FORCE_RET();
666
}
667

    
668
#define NEON_FN(dest, src1, src2) dest = src1 * src2
669
NEON_VOP(mul_u8, neon_u8, 4)
670
NEON_VOP(mul_u16, neon_u16, 2)
671
#undef NEON_FN
672

    
673
NEON_OP(mul_f32)
674
{
675
    T0 = vfp_stoi(float32_mul(vfp_itos(T0), vfp_itos(T1), NFS));
676
    FORCE_RET();
677
}
678

    
679
NEON_OP(mul_p8)
680
{
681
    T0 = helper_neon_mul_p8(T0, T1);
682
}
683

    
684
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
685
NEON_VOP(tst_u8, neon_u8, 4)
686
NEON_VOP(tst_u16, neon_u16, 2)
687
NEON_VOP(tst_u32, neon_u32, 1)
688
#undef NEON_FN
689

    
690
#define NEON_FN(dest, src1, src2) dest = (src1 == src2) ? -1 : 0
691
NEON_VOP(ceq_u8, neon_u8, 4)
692
NEON_VOP(ceq_u16, neon_u16, 2)
693
NEON_VOP(ceq_u32, neon_u32, 1)
694
#undef NEON_FN
695

    
696
#define NEON_QDMULH16(dest, src1, src2, round) do { \
697
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
698
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
699
        env->QF = 1; \
700
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
701
    } else { \
        tmp <<= 1; \
    } \
703
    if (round) { \
704
        int32_t old = tmp; \
705
        tmp += 1 << 15; \
706
        if ((int32_t)tmp < old) { \
707
            env->QF = 1; \
708
            tmp = SIGNBIT - 1; \
709
        } \
710
    } \
711
    dest = tmp >> 16; \
712
    } while(0)
713
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
714
NEON_VOP(qdmulh_s16, neon_s16, 2)
715
#undef NEON_FN
716
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
717
NEON_VOP(qrdmulh_s16, neon_s16, 2)
718
#undef NEON_FN
719
#undef NEON_QDMULH16
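
/* Worked example (illustrative, reading the operands as Q15 fixed point):
   for qdmulh_s16, 0x4000 * 0x4000 (0.5 * 0.5) gives tmp = 0x10000000,
   doubled to 0x20000000, so the lane result is 0x2000 (0.25); for
   0x8000 * 0x8000 the doubling test fires, QF is set and the lane result is
   the saturated value 0x7fff, as VQDMULH requires.  */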
720

    
721
#define SIGNBIT64 ((uint64_t)1 << 63)
722
#define NEON_QDMULH32(dest, src1, src2, round) do { \
723
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
724
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
725
        env->QF = 1; \
726
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
727
    } else { \
728
        tmp <<= 1; \
729
    } \
730
    if (round) { \
731
        int64_t old = tmp; \
732
        tmp += (int64_t)1 << 31; \
733
        if ((int64_t)tmp < old) { \
734
            env->QF = 1; \
735
            tmp = SIGNBIT64 - 1; \
736
        } \
737
    } \
738
    dest = tmp >> 32; \
739
    } while(0)
740
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
741
NEON_VOP(qdmulh_s32, neon_s32, 1)
742
#undef NEON_FN
743
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
744
NEON_VOP(qrdmulh_s32, neon_s32, 1)
745
#undef NEON_FN
746
#undef NEON_QDMULH32
747

    
748
/* Floating point comparisons produce an integer result.  */
749
#define NEON_VOP_FCMP(name, cmp) \
750
NEON_OP(name) \
751
{ \
752
    if (float32_compare_quiet(vfp_itos(T0), vfp_itos(T1), NFS) cmp 0) \
753
        T0 = -1; \
754
    else \
755
        T0 = 0; \
756
    FORCE_RET(); \
757
}
758

    
759
NEON_VOP_FCMP(ceq_f32, ==)
760
NEON_VOP_FCMP(cge_f32, >=)
761
NEON_VOP_FCMP(cgt_f32, >)
762

    
763
NEON_OP(acge_f32)
764
{
765
    float32 f0 = float32_abs(vfp_itos(T0));
766
    float32 f1 = float32_abs(vfp_itos(T1));
767
    T0 = (float32_compare_quiet(f0, f1,NFS) >= 0) ? -1 : 0;
768
    FORCE_RET();
769
}
770

    
771
NEON_OP(acgt_f32)
772
{
773
    float32 f0 = float32_abs(vfp_itos(T0));
774
    float32 f1 = float32_abs(vfp_itos(T1));
775
    T0 = (float32_compare_quiet(f0, f1, NFS) > 0) ? -1 : 0;
776
    FORCE_RET();
777
}
778

    
779
/* Narrowing instructions.  The named type is the destination type.  */
780
NEON_OP(narrow_u8)
781
{
782
    T0 = (T0 & 0xff) | ((T0 >> 8) & 0xff00)
783
         | ((T1 << 16) & 0xff0000) | ((T1 << 8) & 0xff000000);
784
    FORCE_RET();
785
}
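
/* Worked example (illustrative, assuming the usual VMOVN lane packing): the
   low bytes of the four 16-bit lanes are packed in order, so T0 = 0x00aa00bb
   and T1 = 0x00cc00dd give T0 = 0xccddaabb.  */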
786

    
787
NEON_OP(narrow_sat_u8)
788
{
789
    neon_u16 src;
790
    neon_u8 dest;
791
#define SAT8(d, s) \
792
    if (s > 0xff) { \
793
        d = 0xff; \
794
        env->QF = 1; \
795
    } else  { \
796
        d = s; \
797
    }
798

    
799
    NEON_UNPACK(neon_u16, src, T0);
800
    SAT8(dest.v1, src.v1);
801
    SAT8(dest.v2, src.v2);
802
    NEON_UNPACK(neon_u16, src, T1);
803
    SAT8(dest.v3, src.v1);
804
    SAT8(dest.v4, src.v2);
805
    NEON_PACK(neon_u8, T0, dest);
806
    FORCE_RET();
807
#undef SAT8
808
}
809

    
810
NEON_OP(narrow_sat_s8)
811
{
812
    neon_s16 src;
813
    neon_s8 dest;
814
#define SAT8(d, s) \
815
    if (s != (int8_t)s) { \
816
        d = (s >> 15) ^ 0x7f; \
817
        env->QF = 1; \
818
    } else  { \
819
        d = s; \
820
    }
821

    
822
    NEON_UNPACK(neon_s16, src, T0);
823
    SAT8(dest.v1, src.v1);
824
    SAT8(dest.v2, src.v2);
825
    NEON_UNPACK(neon_s16, src, T1);
826
    SAT8(dest.v3, src.v1);
827
    SAT8(dest.v4, src.v2);
828
    NEON_PACK(neon_s8, T0, dest);
829
    FORCE_RET();
830
#undef SAT8
831
}
832

    
833
NEON_OP(narrow_u16)
834
{
835
    T0 = (T0 & 0xffff) | (T1 << 16);
836
}
837

    
838
NEON_OP(narrow_sat_u16)
839
{
840
    if (T0 > 0xffff) {
841
        T0 = 0xffff;
842
        env->QF = 1;
843
    }
844
    if (T1 > 0xffff) {
845
        T1 = 0xffff;
846
        env->QF = 1;
847
    }
848
    T0 |= T1 << 16;
849
    FORCE_RET();
850
}
851

    
852
NEON_OP(narrow_sat_s16)
853
{
854
    if ((int32_t)T0 != (int16_t)T0) {
855
        T0 = ((int32_t)T0 >> 31) ^ 0x7fff;
856
        env->QF = 1;
857
    }
858
    if ((int32_t)T1 != (int16_t) T1) {
859
        T1 = ((int32_t)T1 >> 31) ^ 0x7fff;
860
        env->QF = 1;
861
    }
862
    T0 = (uint16_t)T0 | (T1 << 16);
863
    FORCE_RET();
864
}
865

    
866
NEON_OP(narrow_sat_u32)
867
{
868
    if (T1) {
869
        T0 = 0xffffffffu;
870
        env->QF = 1;
871
    }
872
    FORCE_RET();
873
}
874

    
875
NEON_OP(narrow_sat_s32)
876
{
877
    int32_t sign = (int32_t)T1 >> 31;
878

    
879
    if ((int32_t)T1 != sign) {
880
        T0 = sign ^ 0x7fffffff;
881
        env->QF = 1;
882
    }
883
    FORCE_RET();
884
}
885

    
886
/* Narrowing instructions returning the high half.  Named type is the narrow type.  */
887
NEON_OP(narrow_high_u8)
888
{
889
    T0 = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00)
890
        | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000);
891
    FORCE_RET();
892
}
893

    
894
NEON_OP(narrow_high_u16)
895
{
896
    T0 = (T0 >> 16) | (T1 & 0xffff0000);
897
    FORCE_RET();
898
}
899

    
900
NEON_OP(narrow_high_round_u8)
901
{
902
    T0 = (((T0 + 0x80) >> 8) & 0xff) | (((T0 + 0x800000) >> 16) & 0xff00)
903
        | (((T1 + 0x80) << 8) & 0xff0000) | ((T1 + 0x800000) & 0xff000000);
904
    FORCE_RET();
905
}
906

    
907
NEON_OP(narrow_high_round_u16)
908
{
909
    T0 = ((T0 + 0x8000) >> 16) | ((T1 + 0x8000) & 0xffff0000);
910
    FORCE_RET();
911
}
912

    
913
NEON_OP(narrow_high_round_u32)
914
{
915
    if (T0 >= 0x80000000u)
916
        T0 = T1 + 1;
917
    else
918
        T0 = T1;
919
    FORCE_RET();
920
}
921

    
922
/* Widening instructions.  Named type is source type.  */
923
NEON_OP(widen_s8)
924
{
925
    uint32_t src;
926

    
927
    src = T0;
928
    T0 = (uint16_t)(int8_t)src | ((int8_t)(src >> 8) << 16);
929
    T1 = (uint16_t)(int8_t)(src >> 16) | ((int8_t)(src >> 24) << 16);
930
}
931

    
932
NEON_OP(widen_u8)
933
{
934
    T1 = ((T0 >> 8) & 0xff0000) | ((T0 >> 16) & 0xff);
935
    T0 = ((T0 << 8) & 0xff0000) | (T0 & 0xff);
936
}
937

    
938
NEON_OP(widen_s16)
939
{
940
    int32_t src;
941

    
942
    src = T0;
943
    T0 = (int16_t)src;
944
    T1 = src >> 16;
945
}
946

    
947
NEON_OP(widen_u16)
948
{
949
    T1 = T0 >> 16;
950
    T0 &= 0xffff;
951
}
952

    
953
NEON_OP(widen_s32)
954
{
955
    T1 = (int32_t)T0 >> 31;
956
    FORCE_RET();
957
}
958

    
959
NEON_OP(widen_high_u8)
960
{
961
    T1 = (T0 & 0xff000000) | ((T0 >> 8) & 0xff00);
962
    T0 = ((T0 << 16) & 0xff000000) | ((T0 << 8) & 0xff00);
963
}
964

    
965
NEON_OP(widen_high_u16)
966
{
967
    T1 = T0 & 0xffff0000;
968
    T0 <<= 16;
969
}
970

    
971
/* Long operations.  The type is the wide type.  */
972
NEON_OP(shll_u16)
973
{
974
    int shift = PARAM1;
975
    uint32_t mask;
976

    
977
    mask = 0xffff >> (16 - shift);
978
    mask |= mask << 16;
979
    mask = ~mask;
980

    
981
    T0 = (T0 << shift) & mask;
982
    T1 = (T1 << shift) & mask;
983
    FORCE_RET();
984
}
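
/* Note: the mask above clears the low 'shift' bits of each 16-bit lane after
   the shift, i.e. exactly the bits that spilled in from the lane below;
   e.g. with shift = 4 the mask is 0xfff0fff0.  */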
985

    
986
NEON_OP(shll_u64)
987
{
988
    int shift = PARAM1;
989

    
990
    T1 <<= shift;
991
    T1 |= T0 >> (32 - shift);
992
    T0 <<= shift;
993
    FORCE_RET();
994
}
995

    
996
NEON_OP(addl_u16)
997
{
998
    uint32_t tmp;
999
    uint32_t high;
1000

    
1001
    tmp = env->vfp.scratch[0];
1002
    high = (T0 >> 16) + (tmp >> 16);
1003
    T0 = (uint16_t)(T0 + tmp);
1004
    T0 |= (high << 16);
1005
    tmp = env->vfp.scratch[1];
1006
    high = (T1 >> 16) + (tmp >> 16);
1007
    T1 = (uint16_t)(T1 + tmp);
1008
    T1 |= (high << 16);
1009
    FORCE_RET();
1010
}
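
/* Note: the 16-bit lanes are summed separately above so that a carry out of
   the low lane is discarded rather than leaking into the high lane; the
   scratch registers hold the second operand, split into low/high words the
   same way as T0/T1.  */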
1011

    
1012
NEON_OP(addl_u32)
1013
{
1014
    T0 += env->vfp.scratch[0];
1015
    T1 += env->vfp.scratch[1];
1016
    FORCE_RET();
1017
}
1018

    
1019
NEON_OP(addl_u64)
1020
{
1021
    uint64_t tmp;
1022
    tmp = T0 | ((uint64_t)T1 << 32);
1023
    tmp += env->vfp.scratch[0];
1024
    tmp += (uint64_t)env->vfp.scratch[1] << 32;
1025
    T0 = tmp;
1026
    T1 = tmp >> 32;
1027
    FORCE_RET();
1028
}
1029

    
1030
NEON_OP(subl_u16)
1031
{
1032
    uint32_t tmp;
1033
    uint32_t high;
1034

    
1035
    tmp = env->vfp.scratch[0];
1036
    high = (T0 >> 16) - (tmp >> 16);
1037
    T0 = (uint16_t)(T0 - tmp);
1038
    T0 |= (high << 16);
1039
    tmp = env->vfp.scratch[1];
1040
    high = (T1 >> 16) - (tmp >> 16);
1041
    T1 = (uint16_t)(T1 - tmp);
1042
    T1 |= (high << 16);
1043
    FORCE_RET();
1044
}
1045

    
1046
NEON_OP(subl_u32)
1047
{
1048
    T0 -= env->vfp.scratch[0];
1049
    T1 -= env->vfp.scratch[1];
1050
    FORCE_RET();
1051
}
1052

    
1053
NEON_OP(subl_u64)
1054
{
1055
    uint64_t tmp;
1056
    tmp = T0 | ((uint64_t)T1 << 32);
1057
    tmp -= env->vfp.scratch[0];
1058
    tmp -= (uint64_t)env->vfp.scratch[1] << 32;
1059
    T0 = tmp;
1060
    T1 = tmp >> 32;
1061
    FORCE_RET();
1062
}
1063

    
1064
#define DO_ABD(dest, x, y, type) do { \
1065
    type tmp_x = x; \
1066
    type tmp_y = y; \
1067
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
1068
    } while(0)
1069

    
1070
NEON_OP(abdl_u16)
1071
{
1072
    uint32_t tmp;
1073
    uint32_t low;
1074
    uint32_t high;
1075

    
1076
    DO_ABD(low, T0, T1, uint8_t);
1077
    DO_ABD(tmp, T0 >> 8, T1 >> 8, uint8_t);
1078
    low |= tmp << 16;
1079
    DO_ABD(high, T0 >> 16, T1 >> 16, uint8_t);
1080
    DO_ABD(tmp, T0 >> 24, T1 >> 24, uint8_t);
1081
    high |= tmp << 16;
1082
    T0 = low;
1083
    T1 = high;
1084
    FORCE_RET();
1085
}
1086

    
1087
NEON_OP(abdl_s16)
1088
{
1089
    uint32_t tmp;
1090
    uint32_t low;
1091
    uint32_t high;
1092

    
1093
    DO_ABD(low, T0, T1, int8_t);
1094
    DO_ABD(tmp, T0 >> 8, T1 >> 8, int8_t);
1095
    low |= tmp << 16;
1096
    DO_ABD(high, T0 >> 16, T1 >> 16, int8_t);
1097
    DO_ABD(tmp, T0 >> 24, T1 >> 24, int8_t);
1098
    high |= tmp << 16;
1099
    T0 = low;
1100
    T1 = high;
1101
    FORCE_RET();
1102
}
1103

    
1104
NEON_OP(abdl_u32)
1105
{
1106
    uint32_t low;
1107
    uint32_t high;
1108

    
1109
    DO_ABD(low, T0, T1, uint16_t);
1110
    DO_ABD(high, T0 >> 16, T1 >> 16, uint16_t);
1111
    T0 = low;
1112
    T1 = high;
1113
    FORCE_RET();
1114
}
1115

    
1116
NEON_OP(abdl_s32)
1117
{
1118
    uint32_t low;
1119
    uint32_t high;
1120

    
1121
    DO_ABD(low, T0, T1, int16_t);
1122
    DO_ABD(high, T0 >> 16, T1 >> 16, int16_t);
1123
    T0 = low;
1124
    T1 = high;
1125
    FORCE_RET();
1126
}
1127

    
1128
NEON_OP(abdl_u64)
1129
{
1130
    DO_ABD(T0, T0, T1, uint32_t);
1131
    T1 = 0;
1132
}
1133

    
1134
NEON_OP(abdl_s64)
1135
{
1136
    DO_ABD(T0, T0, T1, int32_t);
1137
    T1 = 0;
1138
}
1139
#undef DO_ABD
1140

    
1141
/* Widening multiply.  Named type is the source type.  */
1142
#define DO_MULL(dest, x, y, type1, type2) do { \
1143
    type1 tmp_x = x; \
1144
    type1 tmp_y = y; \
1145
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
1146
    } while(0)
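
/* Illustrative sketch (example values only): DO_MULL(low, T0, T1, uint16_t,
   uint32_t) multiplies the bottom 16-bit lanes of T0 and T1 as a full 32-bit
   product, so in mull_u16 with T0 = 0x0003fffe and T1 = 0x00020004 the
   results are low = 0xfffe * 0x0004 = 0x0003fff8 and
   high = 0x0003 * 0x0002 = 0x00000006.  */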
1147

    
1148
NEON_OP(mull_u8)
1149
{
1150
    uint32_t tmp;
1151
    uint32_t low;
1152
    uint32_t high;
1153

    
1154
    DO_MULL(low, T0, T1, uint8_t, uint16_t);
1155
    DO_MULL(tmp, T0 >> 8, T1 >> 8, uint8_t, uint16_t);
1156
    low |= tmp << 16;
1157
    DO_MULL(high, T0 >> 16, T1 >> 16, uint8_t, uint16_t);
1158
    DO_MULL(tmp, T0 >> 24, T1 >> 24, uint8_t, uint16_t);
1159
    high |= tmp << 16;
1160
    T0 = low;
1161
    T1 = high;
1162
    FORCE_RET();
1163
}
1164

    
1165
NEON_OP(mull_s8)
1166
{
1167
    uint32_t tmp;
1168
    uint32_t low;
1169
    uint32_t high;
1170

    
1171
    DO_MULL(low, T0, T1, int8_t, uint16_t);
1172
    DO_MULL(tmp, T0 >> 8, T1 >> 8, int8_t, uint16_t);
1173
    low |= tmp << 16;
1174
    DO_MULL(high, T0 >> 16, T1 >> 16, int8_t, uint16_t);
1175
    DO_MULL(tmp, T0 >> 24, T1 >> 24, int8_t, uint16_t);
1176
    high |= tmp << 16;
1177
    T0 = low;
1178
    T1 = high;
1179
    FORCE_RET();
1180
}
1181

    
1182
NEON_OP(mull_u16)
1183
{
1184
    uint32_t low;
1185
    uint32_t high;
1186

    
1187
    DO_MULL(low, T0, T1, uint16_t, uint32_t);
1188
    DO_MULL(high, T0 >> 16, T1 >> 16, uint16_t, uint32_t);
1189
    T0 = low;
1190
    T1 = high;
1191
    FORCE_RET();
1192
}
1193

    
1194
NEON_OP(mull_s16)
1195
{
1196
    uint32_t low;
1197
    uint32_t high;
1198

    
1199
    DO_MULL(low, T0, T1, int16_t, uint32_t);
1200
    DO_MULL(high, T0 >> 16, T1 >> 16, int16_t, uint32_t);
1201
    T0 = low;
1202
    T1 = high;
1203
    FORCE_RET();
1204
}
1205

    
1206
NEON_OP(addl_saturate_s32)
1207
{
1208
    uint32_t tmp;
1209
    uint32_t res;
1210

    
1211
    tmp = env->vfp.scratch[0];
1212
    res = T0 + tmp;
1213
    if (((res ^ T0) & SIGNBIT) && !((T0 ^ tmp) & SIGNBIT)) {
1214
        env->QF = 1;
1215
        T0 = (T0 >> 31) ^ 0x7fffffff;
1216
    } else {
1217
      T0 = res;
1218
    }
1219
    tmp = env->vfp.scratch[1];
1220
    res = T1 + tmp;
1221
    if (((res ^ T1) & SIGNBIT) && !((T1 ^ tmp) & SIGNBIT)) {
1222
        env->QF = 1;
1223
        T1 = (T1 >> 31) ^ 0x7fffffff;
1224
    } else {
1225
      T1 = res;
1226
    }
1227
    FORCE_RET();
1228
}
1229

    
1230
NEON_OP(addl_saturate_s64)
1231
{
1232
    uint64_t src1;
1233
    uint64_t src2;
1234
    uint64_t res;
1235

    
1236
    src1 = T0 + ((uint64_t)T1 << 32);
1237
    src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);
1238
    res = src1 + src2;
1239
    if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {
1240
        env->QF = 1;
1241
        T0 = ~(int64_t)src1 >> 63;
1242
        T1 = T0 ^ 0x80000000;
1243
    } else {
1244
      T0 = res;
1245
      T1 = res >> 32;
1246
    }
1247
    FORCE_RET();
1248
}
1249

    
1250
NEON_OP(addl_saturate_u64)
1251
{
1252
    uint64_t src1;
1253
    uint64_t src2;
1254
    uint64_t res;
1255

    
1256
    src1 = T0 + ((uint64_t)T1 << 32);
1257
    src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);
1258
    res = src1 + src2;
1259
    if (res < src1) {
1260
        env->QF = 1;
1261
        T0 = 0xffffffff;
1262
        T1 = 0xffffffff;
1263
    } else {
1264
      T0 = res;
1265
      T1 = res >> 32;
1266
    }
1267
    FORCE_RET();
1268
}
1269

    
1270
NEON_OP(subl_saturate_s64)
1271
{
1272
    uint64_t src1;
1273
    uint64_t src2;
1274
    uint64_t res;
1275

    
1276
    src1 = T0 + ((uint64_t)T1 << 32);
1277
    src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);
1278
    res = src1 - src2;
1279
    if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {
1280
        env->QF = 1;
1281
        T0 = ~(int64_t)src1 >> 63;
1282
        T1 = T0 ^ 0x80000000;
1283
    } else {
1284
      T0 = res;
1285
      T1 = res >> 32;
1286
    }
1287
    FORCE_RET();
1288
}
1289

    
1290
NEON_OP(subl_saturate_u64)
1291
{
1292
    uint64_t src1;
1293
    uint64_t src2;
1294
    uint64_t res;
1295

    
1296
    src1 = T0 + ((uint64_t)T1 << 32);
1297
    src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);
1298
    if (src1 < src2) {
1299
        env->QF = 1;
1300
        T0 = 0;
1301
        T1 = 0;
1302
    } else {
1303
      res = src1 - src2;
1304
      T0 = res;
1305
      T1 = res >> 32;
1306
    }
1307
    FORCE_RET();
1308
}
1309

    
1310
NEON_OP(negl_u16)
1311
{
1312
    uint32_t tmp;
1313
    tmp = T0 >> 16;
1314
    tmp = -tmp;
1315
    T0 = (-T0 & 0xffff) | (tmp << 16);
1316
    tmp = T1 >> 16;
1317
    tmp = -tmp;
1318
    T1 = (-T1 & 0xffff) | (tmp << 16);
1319
    FORCE_RET();
1320
}
1321

    
1322
NEON_OP(negl_u32)
1323
{
1324
    T0 = -T0;
1325
    T1 = -T1;
1326
    FORCE_RET();
1327
}
1328

    
1329
NEON_OP(negl_u64)
1330
{
1331
    uint64_t val;
1332

    
1333
    val = T0 | ((uint64_t)T1 << 32);
1334
    val = -val;
1335
    T0 = val;
1336
    T1 = val >> 32;
1337
    FORCE_RET();
1338
}
1339

    
1340
/* Scalar operations.  */
1341
NEON_OP(dup_low16)
1342
{
1343
    T0 = (T0 & 0xffff) | (T0 << 16);
1344
    FORCE_RET();
1345
}
1346

    
1347
NEON_OP(dup_high16)
1348
{
1349
    T0 = (T0 >> 16) | (T0 & 0xffff0000);
1350
    FORCE_RET();
1351
}
1352

    
1353
/* Helper for VEXT */
1354
NEON_OP(extract)
1355
{
1356
    int shift = PARAM1;
1357
    T0 = (T0 >> shift) | (T1 << (32 - shift));
1358
    FORCE_RET();
1359
}
1360

    
1361
/* Pairwise add long.  Named type is source type.  */
1362
NEON_OP(paddl_s8)
1363
{
1364
    int8_t src1;
1365
    int8_t src2;
1366
    uint16_t result;
1367
    src1 = T0 >> 24;
1368
    src2 = T0 >> 16;
1369
    result = (uint16_t)src1 + src2;
1370
    src1 = T0 >> 8;
1371
    src2 = T0;
1372
    T0 = (uint16_t)((uint16_t)src1 + src2) | ((uint32_t)result << 16);
1373
    FORCE_RET();
1374
}
1375

    
1376
NEON_OP(paddl_u8)
1377
{
1378
    uint8_t src1;
1379
    uint8_t src2;
1380
    uint16_t result;
1381
    src1 = T0 >> 24;
1382
    src2 = T0 >> 16;
1383
    result = (uint16_t)src1 + src2;
1384
    src1 = T0 >> 8;
1385
    src2 = T0;
1386
    T0 = (uint16_t)((uint16_t)src1 + src2) | ((uint32_t)result << 16);
1387
    FORCE_RET();
1388
}
1389

    
1390
NEON_OP(paddl_s16)
1391
{
1392
    T0 = (uint32_t)(int16_t)T0 + (uint32_t)(int16_t)(T0 >> 16);
1393
    FORCE_RET();
1394
}
1395

    
1396
NEON_OP(paddl_u16)
1397
{
1398
    T0 = (uint32_t)(uint16_t)T0 + (uint32_t)(uint16_t)(T0 >> 16);
1399
    FORCE_RET();
1400
}
1401

    
1402
NEON_OP(paddl_s32)
1403
{
1404
    int64_t tmp;
1405
    tmp = (int64_t)(int32_t)T0 + (int64_t)(int32_t)T1;
1406
    T0 = tmp;
1407
    T1 = tmp >> 32;
1408
    FORCE_RET();
1409
}
1410

    
1411
NEON_OP(paddl_u32)
1412
{
1413
    uint64_t tmp;
1414
    tmp = (uint64_t)T0 + (uint64_t)T1;
1415
    T0 = tmp;
1416
    T1 = tmp >> 32;
1417
    FORCE_RET();
1418
}
1419

    
1420
/* Count Leading Sign/Zero Bits.  */
1421
static inline int do_clz8(uint8_t x)
1422
{
1423
    int n;
1424
    for (n = 8; x; n--)
1425
        x >>= 1;
1426
    return n;
1427
}
1428

    
1429
static inline int do_clz16(uint16_t x)
1430
{
1431
    int n;
1432
    for (n = 16; x; n--)
1433
        x >>= 1;
1434
    return n;
1435
}
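
/* Note: these count leading zeros by repeated shifting, e.g.
   do_clz8(0x10) == 3 and do_clz8(0) == 8; the cls_* ops below take the clz
   of the value (complemented first if negative) minus one to count the
   leading sign bits.  */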
1436

    
1437
NEON_OP(clz_u8)
1438
{
1439
    uint32_t result;
1440
    uint32_t tmp;
1441

    
1442
    tmp = T0;
1443
    result = do_clz8(tmp);
1444
    result |= do_clz8(tmp >> 8) << 8;
1445
    result |= do_clz8(tmp >> 16) << 16;
1446
    result |= do_clz8(tmp >> 24) << 24;
1447
    T0 = result;
1448
    FORCE_RET();
1449
}
1450

    
1451
NEON_OP(clz_u16)
1452
{
1453
    uint32_t result;
1454
    uint32_t tmp;
1455
    tmp = T0;
1456
    result = do_clz16(tmp);
1457
    result |= do_clz16(tmp >> 16) << 16;
1458
    T0 = result;
1459
    FORCE_RET();
1460
}
1461

    
1462
NEON_OP(cls_s8)
1463
{
1464
    uint32_t result;
1465
    int8_t tmp;
1466
    tmp = T0;
1467
    result = do_clz8((tmp < 0) ? ~tmp : tmp) - 1;
1468
    tmp = T0 >> 8;
1469
    result |= (do_clz8((tmp < 0) ? ~tmp : tmp) - 1) << 8;
1470
    tmp = T0 >> 16;
1471
    result |= (do_clz8((tmp < 0) ? ~tmp : tmp) - 1) << 16;
1472
    tmp = T0 >> 24;
1473
    result |= (do_clz8((tmp < 0) ? ~tmp : tmp) - 1) << 24;
1474
    T0 = result;
1475
    FORCE_RET();
1476
}
1477

    
1478
NEON_OP(cls_s16)
1479
{
1480
    uint32_t result;
1481
    int16_t tmp;
1482
    tmp = T0;
1483
    result = do_clz16((tmp < 0) ? ~tmp : tmp) - 1;
1484
    tmp = T0 >> 16;
1485
    result |= (do_clz16((tmp < 0) ? ~tmp : tmp) - 1) << 16;
1486
    T0 = result;
1487
    FORCE_RET();
1488
}
1489

    
1490
NEON_OP(cls_s32)
1491
{
1492
    int count;
1493
    if ((int32_t)T0 < 0)
1494
        T0 = ~T0;
1495
    for (count = 32; T0 > 0; count--)
1496
        T0 = T0 >> 1;
1497
    T0 = count - 1;
1498
    FORCE_RET();
1499
}
1500

    
1501
/* Bit count.  */
1502
NEON_OP(cnt_u8)
1503
{
1504
    T0 = (T0 & 0x55555555) + ((T0 >>  1) & 0x55555555);
1505
    T0 = (T0 & 0x33333333) + ((T0 >>  2) & 0x33333333);
1506
    T0 = (T0 & 0x0f0f0f0f) + ((T0 >>  4) & 0x0f0f0f0f);
1507
    FORCE_RET();
1508
}
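
/* Note: this is the standard parallel bit count; each step adds neighbouring
   1-, 2- and then 4-bit fields, and because no field ever crosses a byte
   boundary the result holds a per-byte popcount, e.g. an input byte of 0xff
   produces 0x08 in that byte.  */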
1509

    
1510
/* Saturating negation.  */
1511
/* ??? Make these use NEON_VOP1 */
1512
#define DO_QABS8(x) do { \
1513
    if (x == (int8_t)0x80) { \
1514
        x = 0x7f; \
1515
        env->QF = 1; \
1516
    } else if (x < 0) { \
1517
        x = -x; \
1518
    }} while (0)
1519
NEON_OP(qabs_s8)
1520
{
1521
    neon_s8 vec;
1522
    NEON_UNPACK(neon_s8, vec, T0);
1523
    DO_QABS8(vec.v1);
1524
    DO_QABS8(vec.v2);
1525
    DO_QABS8(vec.v3);
1526
    DO_QABS8(vec.v4);
1527
    NEON_PACK(neon_s8, T0, vec);
1528
    FORCE_RET();
1529
}
1530
#undef DO_QABS8
1531

    
1532
#define DO_QNEG8(x) do { \
1533
    if (x == (int8_t)0x80) { \
1534
        x = 0x7f; \
1535
        env->QF = 1; \
1536
    } else { \
1537
        x = -x; \
1538
    }} while (0)
1539
NEON_OP(qneg_s8)
1540
{
1541
    neon_s8 vec;
1542
    NEON_UNPACK(neon_s8, vec, T0);
1543
    DO_QNEG8(vec.v1);
1544
    DO_QNEG8(vec.v2);
1545
    DO_QNEG8(vec.v3);
1546
    DO_QNEG8(vec.v4);
1547
    NEON_PACK(neon_s8, T0, vec);
1548
    FORCE_RET();
1549
}
1550
#undef DO_QNEG8
1551

    
1552
#define DO_QABS16(x) do { \
1553
    if (x == (int16_t)0x8000) { \
1554
        x = 0x7fff; \
1555
        env->QF = 1; \
1556
    } else if (x < 0) { \
1557
        x = -x; \
1558
    }} while (0)
1559
NEON_OP(qabs_s16)
1560
{
1561
    neon_s16 vec;
1562
    NEON_UNPACK(neon_s16, vec, T0);
1563
    DO_QABS16(vec.v1);
1564
    DO_QABS16(vec.v2);
1565
    NEON_PACK(neon_s16, T0, vec);
1566
    FORCE_RET();
1567
}
1568
#undef DO_QABS16
1569

    
1570
#define DO_QNEG16(x) do { \
1571
    if (x == (int16_t)0x8000) { \
1572
        x = 0x7fff; \
1573
        env->QF = 1; \
1574
    } else { \
1575
        x = -x; \
1576
    }} while (0)
1577
NEON_OP(qneg_s16)
1578
{
1579
    neon_s16 vec;
1580
    NEON_UNPACK(neon_s16, vec, T0);
1581
    DO_QNEG16(vec.v1);
1582
    DO_QNEG16(vec.v2);
1583
    NEON_PACK(neon_s16, T0, vec);
1584
    FORCE_RET();
1585
}
1586
#undef DO_QNEG16
1587

    
1588
NEON_OP(qabs_s32)
1589
{
1590
    if (T0 == 0x80000000) {
1591
        T0 = 0x7fffffff;
1592
        env->QF = 1;
1593
    } else if ((int32_t)T0 < 0) {
1594
        T0 = -T0;
1595
    }
1596
    FORCE_RET();
1597
}
1598

    
1599
NEON_OP(qneg_s32)
1600
{
1601
    if (T0 == 0x80000000) {
1602
        T0 = 0x7fffffff;
1603
        env->QF = 1;
1604
    } else {
1605
        T0 = -T0;
1606
    }
1607
    FORCE_RET();
1608
}
1609

    
1610
/* Unary operations.  */
1611
#define NEON_FN(dest, src, dummy) dest = (src < 0) ? -src : src
1612
NEON_VOP1(abs_s8, neon_s8, 4)
1613
NEON_VOP1(abs_s16, neon_s16, 2)
1614
NEON_OP(abs_s32)
1615
{
1616
    if ((int32_t)T0 < 0)
1617
        T0 = -T0;
1618
    FORCE_RET();
1619
}
1620
#undef NEON_FN
1621

    
1622
/* Transpose.  Argument order is rather strange to avoid special casing
1623
   the translation code.
1624
   On input T0 = rm, T1 = rd.  On output T0 = rd, T1 = rm  */
1625
NEON_OP(trn_u8)
1626
{
1627
    uint32_t rd;
1628
    uint32_t rm;
1629
    rd = ((T0 & 0x00ff00ff) << 8) | (T1 & 0x00ff00ff);
1630
    rm = ((T1 & 0xff00ff00) >> 8) | (T0 & 0xff00ff00);
1631
    T0 = rd;
1632
    T1 = rm;
1633
    FORCE_RET();
1634
}
1635

    
1636
NEON_OP(trn_u16)
1637
{
1638
    uint32_t rd;
1639
    uint32_t rm;
1640
    rd = (T0 << 16) | (T1 & 0xffff);
1641
    rm = (T1 >> 16) | (T0 & 0xffff0000);
1642
    T0 = rd;
1643
    T1 = rm;
1644
    FORCE_RET();
1645
}
1646

    
1647
/* Worker routines for zip and unzip.  */
1648
NEON_OP(unzip_u8)
1649
{
1650
    uint32_t rd;
1651
    uint32_t rm;
1652
    rd = (T0 & 0xff) | ((T0 >> 8) & 0xff00)
1653
         | ((T1 << 16) & 0xff0000) | ((T1 << 8) & 0xff000000);
1654
    rm = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00)
1655
         | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000);
1656
    T0 = rd;
1657
    T1 = rm;
1658
    FORCE_RET();
1659
}
1660

    
1661
NEON_OP(zip_u8)
1662
{
1663
    uint32_t rd;
1664
    uint32_t rm;
1665
    rd = (T0 & 0xff) | ((T1 << 8) & 0xff00)
1666
         | ((T0 << 16) & 0xff0000) | ((T1 << 24) & 0xff000000);
1667
    rm = ((T0 >> 16) & 0xff) | ((T1 >> 8) & 0xff00)
1668
         | ((T0 >> 8) & 0xff0000) | (T1 & 0xff000000);
1669
    T0 = rd;
1670
    T1 = rm;
1671
    FORCE_RET();
1672
}
1673

    
1674
NEON_OP(zip_u16)
1675
{
1676
    uint32_t tmp;
1677

    
1678
    tmp = (T0 & 0xffff) | (T1 << 16);
1679
    T1 = (T1 & 0xffff0000) | (T0 >> 16);
1680
    T0 = tmp;
1681
    FORCE_RET();
1682
}
1683

    
1684
NEON_OP(dup_u8)
1685
{
1686
    T0 = (T0 >> PARAM1) & 0xff;
1687
    T0 |= T0 << 8;
1688
    T0 |= T0 << 16;
1689
    FORCE_RET();
1690
}