Statistics
| Branch: | Revision:

root / target-arm / neon_helper.c @ e1d177b9

History | View | Annotate | Download (52.8 kB)

1
/*
 * ARM NEON vector operations.
 *
 * Copyright (c) 2007, 2008 CodeSourcery.
 * Written by Paul Brook
 *
 * This code is licensed under the GNU GPL v2.
 */
9
#include <stdlib.h>
10
#include <stdio.h>
11

    
12
#include "cpu.h"
13
#include "exec-all.h"
14
#include "helpers.h"
15

    
16
/* Sign-bit masks for 32-bit and 64-bit values.  */
#define SIGNBIT ((uint32_t)0x80000000)
#define SIGNBIT64 ((uint64_t)1 << 63)

/* Record that a saturating operation clamped its result.  CPSR_Q is ORed
   into the stored FPSCR value so that the other bits already held there
   are preserved; a plain assignment would overwrite them.  Requires a
   variable named env to be in scope at the point of use.  */
#define SET_QC() (env->vfp.xregs[ARM_VFP_FPSCR] |= CPSR_Q)

/* File-local float_status object.  NFS expands to its address and is
   parenthesized so it can be used safely inside larger expressions.  */
static float_status neon_float_status;
#define NFS (&neon_float_status)

/* Helper routines to perform bitwise copies between float and int.  */
25
static inline float32 vfp_itos(uint32_t i)
26
{
27
    union {
28
        uint32_t i;
29
        float32 s;
30
    } v;
31

    
32
    v.i = i;
33
    return v.s;
34
}
35

    
36
/* Return the uint32_t whose storage holds the same bits as s.  This is the
   inverse bit copy of vfp_itos: written through the float32 member of a
   union and read back through the integer member.  */
static inline uint32_t vfp_stoi(float32 s)
{
    union {
        uint32_t i;
        float32 s;
    } v;

    v.s = s;
    return v.i;
}

/* Lane-view structures.  NEON_TYPEn(name, type) declares a struct typedef
   neon_<name> with n members v1..vn of the given element type.  When
   HOST_WORDS_BIGENDIAN is defined the members are declared in reverse
   order (vn first, v1 last); otherwise v1 comes first.  */
#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#ifdef HOST_WORDS_BIGENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif

/* Four 8-bit lanes, two 16-bit lanes or one 32-bit lane, in signed and
   unsigned flavours.  The generator macros are not needed afterwards.  */
NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1

/* Copy from a uint32_t to a vector structure type.  The value is stored in
   the integer member of a local union and read back through the vtype
   member, so the bits are reinterpreted rather than converted.  */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/* Copy from a vector structure type to a uint32_t.  The inverse of
   NEON_UNPACK: the struct is stored in the vtype member of a local union
   and read back through the integer member.  */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)

/* Element-wise drivers: invoke whatever NEON_FN is currently defined once
   per lane (1, 2 or 4 lanes), pairing lane k of vsrc1 with lane k of vsrc2
   and writing lane k of vdest.  The locals vdest, vsrc1 and vsrc2 must be
   declared by the macro that uses these.  */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

/* Function body shared by the two-operand helpers.  Unpacks the 32-bit
   arguments arg1 and arg2 into lane structures of type vtype, applies
   NEON_DO<n> across the lanes, and returns the repacked result.  */
#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Define the helper neon_<name> taking two packed 32-bit operands; the body
   comes from NEON_VOP_BODY and uses the NEON_FN defined at expansion.  */
#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* Same as NEON_VOP, but the generated helper also receives a CPUState
   pointer named env, which makes env available to the NEON_FN body (for
   example for SET_QC()).  */
#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* Pairwise operations.  */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same.  */
/* NEON_FN is applied to adjacent lanes of the same source: the lower
   result lanes come from pairs of vsrc1, the upper ones from pairs of
   vsrc2.  The last line of NEON_PDO4 deliberately has no trailing
   backslash, so the macro does not absorb whatever line follows it.  */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4);

/* Define the pairwise helper neon_<name>: unpack both 32-bit operands,
   apply NEON_PDO<n> with the current NEON_FN, and return the repacked
   result.  */
#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Unary operators.  */
/* Define the one-operand helper neon_<name>.  NEON_DO<n> still passes
   vsrc2.vK as the third NEON_FN argument, but no vsrc2 is declared here:
   the NEON_FN used with this macro must therefore ignore its third
   parameter so that the token never appears in the expansion.  */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}

#define NEON_USAT(dest, src1, src2, type) do { \
186
    uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
187
    if (tmp != (type)tmp) { \
188
        SET_QC(); \
189
        dest = ~0; \
190
    } else { \
191
        dest = tmp; \
192
    }} while(0)
193
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
194
NEON_VOP_ENV(qadd_u8, neon_u8, 4)
195
#undef NEON_FN
196
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
197
NEON_VOP_ENV(qadd_u16, neon_u16, 2)
198
#undef NEON_FN
199
#undef NEON_USAT
200

    
201
/* Unsigned saturating 32-bit add.  A wrapped sum is detected by the result
   being smaller than the first operand; in that case SET_QC() is invoked
   and all ones is returned.  */
uint32_t HELPER(neon_qadd_u32)(CPUState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a + b;
    if (res < a) {
        SET_QC();
        res = ~0;
    }
    return res;
}

/* Unsigned saturating 64-bit add.  A wrapped sum is detected by the result
   being smaller than src1; in that case SET_QC() is invoked and all ones
   is returned.  */
uint64_t HELPER(neon_qadd_u64)(CPUState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    res = src1 + src2;
    if (res < src1) {
        SET_QC();
        res = ~(uint64_t)0;
    }
    return res;
}

#define NEON_SSAT(dest, src1, src2, type) do { \
224
    int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
225
    if (tmp != (type)tmp) { \
226
        SET_QC(); \
227
        if (src2 > 0) { \
228
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
229
        } else { \
230
            tmp = 1 << (sizeof(type) * 8 - 1); \
231
        } \
232
    } \
233
    dest = tmp; \
234
    } while(0)
235
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
236
NEON_VOP_ENV(qadd_s8, neon_s8, 4)
237
#undef NEON_FN
238
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
239
NEON_VOP_ENV(qadd_s16, neon_s16, 2)
240
#undef NEON_FN
241
#undef NEON_SSAT
242

    
243
/* Signed saturating 32-bit add on operands passed as raw bit patterns.
   Overflow is detected when the operands share a sign bit and the result's
   sign bit differs from a's.  On overflow SET_QC() is invoked and the
   result becomes 0x7fffffff when a is non-negative, 0x80000000 otherwise.  */
uint32_t HELPER(neon_qadd_s32)(CPUState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a + b;
    if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) {
        SET_QC();
        res = ~(((int32_t)a >> 31) ^ SIGNBIT);
    }
    return res;
}

/* Signed saturating 64-bit add on operands passed as raw bit patterns.
   Overflow is detected when the operands share a sign bit and the result's
   sign bit differs from src1's.  On overflow SET_QC() is invoked and the
   result is clamped according to the sign of src1 (all bits but the sign
   bit set when src1 is non-negative, only the sign bit set otherwise).  */
uint64_t HELPER(neon_qadd_s64)(CPUState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    res = src1 + src2;
    if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {
        SET_QC();
        res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
    }
    return res;
}

#define NEON_USAT(dest, src1, src2, type) do { \
266
    uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
267
    if (tmp != (type)tmp) { \
268
        SET_QC(); \
269
        dest = 0; \
270
    } else { \
271
        dest = tmp; \
272
    }} while(0)
273
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
274
NEON_VOP_ENV(qsub_u8, neon_u8, 4)
275
#undef NEON_FN
276
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
277
NEON_VOP_ENV(qsub_u16, neon_u16, 2)
278
#undef NEON_FN
279
#undef NEON_USAT
280

    
281
/* Unsigned saturating 32-bit subtract.  A borrow is detected by the result
   being larger than a; in that case SET_QC() is invoked and zero is
   returned.  */
uint32_t HELPER(neon_qsub_u32)(CPUState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a - b;
    if (res > a) {
        SET_QC();
        res = 0;
    }
    return res;
}

/* Unsigned saturating 64-bit subtract.  When src1 is smaller than src2 the
   result is clamped to zero and SET_QC() is invoked; otherwise the plain
   difference is returned.  */
uint64_t HELPER(neon_qsub_u64)(CPUState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    if (src1 < src2) {
        SET_QC();
        res = 0;
    } else {
        res = src1 - src2;
    }
    return res;
}

#define NEON_SSAT(dest, src1, src2, type) do { \
305
    int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
306
    if (tmp != (type)tmp) { \
307
        SET_QC(); \
308
        if (src2 < 0) { \
309
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
310
        } else { \
311
            tmp = 1 << (sizeof(type) * 8 - 1); \
312
        } \
313
    } \
314
    dest = tmp; \
315
    } while(0)
316
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
317
NEON_VOP_ENV(qsub_s8, neon_s8, 4)
318
#undef NEON_FN
319
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
320
NEON_VOP_ENV(qsub_s16, neon_s16, 2)
321
#undef NEON_FN
322
#undef NEON_SSAT
323

    
324
/* Signed saturating 32-bit subtract on operands passed as raw bit
   patterns.  Overflow is detected when the operands have different sign
   bits and the result's sign bit differs from a's.  On overflow SET_QC()
   is invoked and the result becomes 0x7fffffff when a is non-negative,
   0x80000000 otherwise.  */
uint32_t HELPER(neon_qsub_s32)(CPUState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a - b;
    if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) {
        SET_QC();
        res = ~(((int32_t)a >> 31) ^ SIGNBIT);
    }
    return res;
}

/* Signed saturating 64-bit subtract on operands passed as raw bit
   patterns.  Overflow is detected when the operands have different sign
   bits and the result's sign bit differs from src1's.  On overflow
   SET_QC() is invoked and the result is clamped according to the sign of
   src1.  */
uint64_t HELPER(neon_qsub_s64)(CPUState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    res = src1 - src2;
    if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {
        SET_QC();
        res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
    }
    return res;
}

#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
347
NEON_VOP(hadd_s8, neon_s8, 4)
348
NEON_VOP(hadd_u8, neon_u8, 4)
349
NEON_VOP(hadd_s16, neon_s16, 2)
350
NEON_VOP(hadd_u16, neon_u16, 2)
351
#undef NEON_FN
352

    
353
int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
354
{
355
    int32_t dest;
356

    
357
    dest = (src1 >> 1) + (src2 >> 1);
358
    if (src1 & src2 & 1)
359
        dest++;
360
    return dest;
361
}
362

    
363
uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
364
{
365
    uint32_t dest;
366

    
367
    dest = (src1 >> 1) + (src2 >> 1);
368
    if (src1 & src2 & 1)
369
        dest++;
370
    return dest;
371
}
372

    
373
#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
374
NEON_VOP(rhadd_s8, neon_s8, 4)
375
NEON_VOP(rhadd_u8, neon_u8, 4)
376
NEON_VOP(rhadd_s16, neon_s16, 2)
377
NEON_VOP(rhadd_u16, neon_u16, 2)
378
#undef NEON_FN
379

    
380
int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
381
{
382
    int32_t dest;
383

    
384
    dest = (src1 >> 1) + (src2 >> 1);
385
    if ((src1 | src2) & 1)
386
        dest++;
387
    return dest;
388
}
389

    
390
uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
391
{
392
    uint32_t dest;
393

    
394
    dest = (src1 >> 1) + (src2 >> 1);
395
    if ((src1 | src2) & 1)
396
        dest++;
397
    return dest;
398
}
399

    
400
#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
401
NEON_VOP(hsub_s8, neon_s8, 4)
402
NEON_VOP(hsub_u8, neon_u8, 4)
403
NEON_VOP(hsub_s16, neon_s16, 2)
404
NEON_VOP(hsub_u16, neon_u16, 2)
405
#undef NEON_FN
406

    
407
int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
408
{
409
    int32_t dest;
410

    
411
    dest = (src1 >> 1) - (src2 >> 1);
412
    if ((~src1) & src2 & 1)
413
        dest--;
414
    return dest;
415
}
416

    
417
uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
418
{
419
    uint32_t dest;
420

    
421
    dest = (src1 >> 1) - (src2 >> 1);
422
    if ((~src1) & src2 & 1)
423
        dest--;
424
    return dest;
425
}
426

    
427
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? ~0 : 0
428
NEON_VOP(cgt_s8, neon_s8, 4)
429
NEON_VOP(cgt_u8, neon_u8, 4)
430
NEON_VOP(cgt_s16, neon_s16, 2)
431
NEON_VOP(cgt_u16, neon_u16, 2)
432
NEON_VOP(cgt_s32, neon_s32, 1)
433
NEON_VOP(cgt_u32, neon_u32, 1)
434
#undef NEON_FN
435

    
436
#define NEON_FN(dest, src1, src2) dest = (src1 >= src2) ? ~0 : 0
437
NEON_VOP(cge_s8, neon_s8, 4)
438
NEON_VOP(cge_u8, neon_u8, 4)
439
NEON_VOP(cge_s16, neon_s16, 2)
440
NEON_VOP(cge_u16, neon_u16, 2)
441
NEON_VOP(cge_s32, neon_s32, 1)
442
NEON_VOP(cge_u32, neon_u32, 1)
443
#undef NEON_FN
444

    
445
#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
446
NEON_VOP(min_s8, neon_s8, 4)
447
NEON_VOP(min_u8, neon_u8, 4)
448
NEON_VOP(min_s16, neon_s16, 2)
449
NEON_VOP(min_u16, neon_u16, 2)
450
NEON_VOP(min_s32, neon_s32, 1)
451
NEON_VOP(min_u32, neon_u32, 1)
452
NEON_POP(pmin_s8, neon_s8, 4)
453
NEON_POP(pmin_u8, neon_u8, 4)
454
NEON_POP(pmin_s16, neon_s16, 2)
455
NEON_POP(pmin_u16, neon_u16, 2)
456
#undef NEON_FN
457

    
458
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
459
NEON_VOP(max_s8, neon_s8, 4)
460
NEON_VOP(max_u8, neon_u8, 4)
461
NEON_VOP(max_s16, neon_s16, 2)
462
NEON_VOP(max_u16, neon_u16, 2)
463
NEON_VOP(max_s32, neon_s32, 1)
464
NEON_VOP(max_u32, neon_u32, 1)
465
NEON_POP(pmax_s8, neon_s8, 4)
466
NEON_POP(pmax_u8, neon_u8, 4)
467
NEON_POP(pmax_s16, neon_s16, 2)
468
NEON_POP(pmax_u16, neon_u16, 2)
469
#undef NEON_FN
470

    
471
#define NEON_FN(dest, src1, src2) \
472
    dest = (src1 > src2) ? (src1 - src2) : (src2 - src1)
473
NEON_VOP(abd_s8, neon_s8, 4)
474
NEON_VOP(abd_u8, neon_u8, 4)
475
NEON_VOP(abd_s16, neon_s16, 2)
476
NEON_VOP(abd_u16, neon_u16, 2)
477
NEON_VOP(abd_s32, neon_s32, 1)
478
NEON_VOP(abd_u32, neon_u32, 1)
479
#undef NEON_FN
480

    
481
#define NEON_FN(dest, src1, src2) do { \
482
    int8_t tmp; \
483
    tmp = (int8_t)src2; \
484
    if (tmp >= (ssize_t)sizeof(src1) * 8 || \
485
        tmp <= -(ssize_t)sizeof(src1) * 8) { \
486
        dest = 0; \
487
    } else if (tmp < 0) { \
488
        dest = src1 >> -tmp; \
489
    } else { \
490
        dest = src1 << tmp; \
491
    }} while (0)
492
NEON_VOP(shl_u8, neon_u8, 4)
493
NEON_VOP(shl_u16, neon_u16, 2)
494
NEON_VOP(shl_u32, neon_u32, 1)
495
#undef NEON_FN
496

    
497
uint64_t HELPER(neon_shl_u64)(uint64_t val, uint64_t shiftop)
498
{
499
    int8_t shift = (int8_t)shiftop;
500
    if (shift >= 64 || shift <= -64) {
501
        val = 0;
502
    } else if (shift < 0) {
503
        val >>= -shift;
504
    } else {
505
        val <<= shift;
506
    }
507
    return val;
508
}
509

    
510
#define NEON_FN(dest, src1, src2) do { \
511
    int8_t tmp; \
512
    tmp = (int8_t)src2; \
513
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
514
        dest = 0; \
515
    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
516
        dest = src1 >> (sizeof(src1) * 8 - 1); \
517
    } else if (tmp < 0) { \
518
        dest = src1 >> -tmp; \
519
    } else { \
520
        dest = src1 << tmp; \
521
    }} while (0)
522
NEON_VOP(shl_s8, neon_s8, 4)
523
NEON_VOP(shl_s16, neon_s16, 2)
524
NEON_VOP(shl_s32, neon_s32, 1)
525
#undef NEON_FN
526

    
527
uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t shiftop)
528
{
529
    int8_t shift = (int8_t)shiftop;
530
    int64_t val = valop;
531
    if (shift >= 64) {
532
        val = 0;
533
    } else if (shift <= -64) {
534
        val >>= 63;
535
    } else if (shift < 0) {
536
        val >>= -shift;
537
    } else {
538
        val <<= shift;
539
    }
540
    return val;
541
}
542

    
543
#define NEON_FN(dest, src1, src2) do { \
544
    int8_t tmp; \
545
    tmp = (int8_t)src2; \
546
    if ((tmp >= (ssize_t)sizeof(src1) * 8) \
547
        || (tmp <= -(ssize_t)sizeof(src1) * 8)) { \
548
        dest = 0; \
549
    } else if (tmp < 0) { \
550
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
551
    } else { \
552
        dest = src1 << tmp; \
553
    }} while (0)
554
NEON_VOP(rshl_s8, neon_s8, 4)
555
NEON_VOP(rshl_s16, neon_s16, 2)
556
#undef NEON_FN
557

    
558
/* The addition of the rounding constant may overflow, so we use an
559
 * intermediate 64 bits accumulator.  */
560
uint32_t HELPER(neon_rshl_s32)(uint32_t valop, uint32_t shiftop)
561
{
562
    int32_t dest;
563
    int32_t val = (int32_t)valop;
564
    int8_t shift = (int8_t)shiftop;
565
    if ((shift >= 32) || (shift <= -32)) {
566
        dest = 0;
567
    } else if (shift < 0) {
568
        int64_t big_dest = ((int64_t)val + (1 << (-1 - shift)));
569
        dest = big_dest >> -shift;
570
    } else {
571
        dest = val << shift;
572
    }
573
    return dest;
574
}
575

    
576
/* Handling addition overflow with 64 bits inputs values is more
577
 * tricky than with 32 bits values.  */
578
uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop)
579
{
580
    int8_t shift = (int8_t)shiftop;
581
    int64_t val = valop;
582
    if ((shift >= 64) || (shift <= -64)) {
583
        val = 0;
584
    } else if (shift < 0) {
585
        val >>= (-shift - 1);
586
        if (val == INT64_MAX) {
587
            /* In this case, it means that the rounding constant is 1,
588
             * and the addition would overflow. Return the actual
589
             * result directly.  */
590
            val = 0x4000000000000000LL;
591
        } else {
592
            val++;
593
            val >>= 1;
594
        }
595
    } else {
596
        val <<= shift;
597
    }
598
    return val;
599
}
600

    
601
#define NEON_FN(dest, src1, src2) do { \
602
    int8_t tmp; \
603
    tmp = (int8_t)src2; \
604
    if (tmp >= (ssize_t)sizeof(src1) * 8 || \
605
        tmp < -(ssize_t)sizeof(src1) * 8) { \
606
        dest = 0; \
607
    } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
608
        dest = src1 >> (-tmp - 1); \
609
    } else if (tmp < 0) { \
610
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
611
    } else { \
612
        dest = src1 << tmp; \
613
    }} while (0)
614
NEON_VOP(rshl_u8, neon_u8, 4)
615
NEON_VOP(rshl_u16, neon_u16, 2)
616
#undef NEON_FN
617

    
618
/* The addition of the rounding constant may overflow, so we use an
619
 * intermediate 64 bits accumulator.  */
620
uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shiftop)
621
{
622
    uint32_t dest;
623
    int8_t shift = (int8_t)shiftop;
624
    if (shift >= 32 || shift < -32) {
625
        dest = 0;
626
    } else if (shift == -32) {
627
        dest = val >> 31;
628
    } else if (shift < 0) {
629
        uint64_t big_dest = ((uint64_t)val + (1 << (-1 - shift)));
630
        dest = big_dest >> -shift;
631
    } else {
632
        dest = val << shift;
633
    }
634
    return dest;
635
}
636

    
637
/* Handling addition overflow with 64 bits inputs values is more
638
 * tricky than with 32 bits values.  */
639
uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop)
640
{
641
    int8_t shift = (uint8_t)shiftop;
642
    if (shift >= 64 || shift < -64) {
643
        val = 0;
644
    } else if (shift == -64) {
645
        /* Rounding a 1-bit result just preserves that bit.  */
646
        val >>= 63;
647
    } else if (shift < 0) {
648
        val >>= (-shift - 1);
649
        if (val == UINT64_MAX) {
650
            /* In this case, it means that the rounding constant is 1,
651
             * and the addition would overflow. Return the actual
652
             * result directly.  */
653
            val = 0x8000000000000000ULL;
654
        } else {
655
            val++;
656
            val >>= 1;
657
        }
658
    } else {
659
        val <<= shift;
660
    }
661
    return val;
662
}
663

    
664
#define NEON_FN(dest, src1, src2) do { \
665
    int8_t tmp; \
666
    tmp = (int8_t)src2; \
667
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
668
        if (src1) { \
669
            SET_QC(); \
670
            dest = ~0; \
671
        } else { \
672
            dest = 0; \
673
        } \
674
    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
675
        dest = 0; \
676
    } else if (tmp < 0) { \
677
        dest = src1 >> -tmp; \
678
    } else { \
679
        dest = src1 << tmp; \
680
        if ((dest >> tmp) != src1) { \
681
            SET_QC(); \
682
            dest = ~0; \
683
        } \
684
    }} while (0)
685
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
686
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
687
NEON_VOP_ENV(qshl_u32, neon_u32, 1)
688
#undef NEON_FN
689

    
690
/* Unsigned saturating 64-bit shift by a signed count taken from the low
   byte of shiftop.  A left shift that loses set bits returns all ones and
   invokes SET_QC(); a zero input shifted left by 64 or more stays zero.
   Right counts of 64 or more give zero; other negative counts shift
   right by their magnitude.  */
uint64_t HELPER(neon_qshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop)
{
    int8_t shift = (int8_t)shiftop;
    if (shift >= 64) {
        if (val) {
            val = ~(uint64_t)0;
            SET_QC();
        }
    } else if (shift <= -64) {
        val = 0;
    } else if (shift < 0) {
        val >>= -shift;
    } else {
        /* Keep the input so the round trip can be compared.  */
        uint64_t tmp = val;
        val <<= shift;
        if ((val >> shift) != tmp) {
            SET_QC();
            val = ~(uint64_t)0;
        }
    }
    return val;
}

#define NEON_FN(dest, src1, src2) do { \
714
    int8_t tmp; \
715
    tmp = (int8_t)src2; \
716
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
717
        if (src1) { \
718
            SET_QC(); \
719
            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
720
            if (src1 > 0) { \
721
                dest--; \
722
            } \
723
        } else { \
724
            dest = src1; \
725
        } \
726
    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
727
        dest = src1 >> 31; \
728
    } else if (tmp < 0) { \
729
        dest = src1 >> -tmp; \
730
    } else { \
731
        dest = src1 << tmp; \
732
        if ((dest >> tmp) != src1) { \
733
            SET_QC(); \
734
            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
735
            if (src1 > 0) { \
736
                dest--; \
737
            } \
738
        } \
739
    }} while (0)
740
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
741
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
742
NEON_VOP_ENV(qshl_s32, neon_s32, 1)
743
#undef NEON_FN
744

    
745
/* Signed saturating 64-bit shift by a signed count taken from the low byte
   of shiftop (cast directly to int8_t, as the other helpers in this file
   do).  A left shift that cannot be undone by shifting back, or a left
   shift of a non-zero value by 64 or more, invokes SET_QC() and clamps
   according to the sign of the input.  Right counts of 64 or more leave
   only copies of the sign bit; other negative counts shift right by
   their magnitude.  */
uint64_t HELPER(neon_qshl_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
{
    int8_t shift = (int8_t)shiftop;
    int64_t val = valop;
    if (shift >= 64) {
        if (val) {
            SET_QC();
            val = (val >> 63) ^ ~SIGNBIT64;
        }
    } else if (shift <= -64) {
        val >>= 63;
    } else if (shift < 0) {
        val >>= -shift;
    } else {
        /* Keep the input so the round trip can be compared.  */
        int64_t tmp = val;
        val <<= shift;
        if ((val >> shift) != tmp) {
            SET_QC();
            val = (tmp >> 63) ^ ~SIGNBIT64;
        }
    }
    return val;
}

#define NEON_FN(dest, src1, src2) do { \
770
    if (src1 & (1 << (sizeof(src1) * 8 - 1))) { \
771
        SET_QC(); \
772
        dest = 0; \
773
    } else { \
774
        int8_t tmp; \
775
        tmp = (int8_t)src2; \
776
        if (tmp >= (ssize_t)sizeof(src1) * 8) { \
777
            if (src1) { \
778
                SET_QC(); \
779
                dest = ~0; \
780
            } else { \
781
                dest = 0; \
782
            } \
783
        } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
784
            dest = 0; \
785
        } else if (tmp < 0) { \
786
            dest = src1 >> -tmp; \
787
        } else { \
788
            dest = src1 << tmp; \
789
            if ((dest >> tmp) != src1) { \
790
                SET_QC(); \
791
                dest = ~0; \
792
            } \
793
        } \
794
    }} while (0)
795
NEON_VOP_ENV(qshlu_s8, neon_u8, 4)
796
NEON_VOP_ENV(qshlu_s16, neon_u16, 2)
797
#undef NEON_FN
798

    
799
/* Saturating shift of a signed 32-bit input to an unsigned result.  A
   negative input gives zero and invokes SET_QC(); any other input is
   handed to the unsigned saturating shift helper.  */
uint32_t HELPER(neon_qshlu_s32)(CPUState *env, uint32_t valop, uint32_t shiftop)
{
    if ((int32_t)valop < 0) {
        SET_QC();
        return 0;
    }
    return helper_neon_qshl_u32(env, valop, shiftop);
}

/* Saturating shift of a signed 64-bit input to an unsigned result.  A
   negative input gives zero and invokes SET_QC(); any other input is
   handed to the unsigned saturating shift helper.  */
uint64_t HELPER(neon_qshlu_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
{
    if ((int64_t)valop < 0) {
        SET_QC();
        return 0;
    }
    return helper_neon_qshl_u64(env, valop, shiftop);
}

/* FIXME: This is wrong.  */
818
#define NEON_FN(dest, src1, src2) do { \
819
    int8_t tmp; \
820
    tmp = (int8_t)src2; \
821
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
822
        if (src1) { \
823
            SET_QC(); \
824
            dest = ~0; \
825
        } else { \
826
            dest = 0; \
827
        } \
828
    } else if (tmp < -(ssize_t)sizeof(src1) * 8) { \
829
        dest = 0; \
830
    } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
831
        dest = src1 >> (sizeof(src1) * 8 - 1); \
832
    } else if (tmp < 0) { \
833
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
834
    } else { \
835
        dest = src1 << tmp; \
836
        if ((dest >> tmp) != src1) { \
837
            SET_QC(); \
838
            dest = ~0; \
839
        } \
840
    }} while (0)
841
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
842
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
843
#undef NEON_FN
844

    
845
/* The addition of the rounding constant may overflow, so we use an
846
 * intermediate 64 bits accumulator.  */
847
uint32_t HELPER(neon_qrshl_u32)(CPUState *env, uint32_t val, uint32_t shiftop)
848
{
849
    uint32_t dest;
850
    int8_t shift = (int8_t)shiftop;
851
    if (shift >= 32) {
852
        if (val) {
853
            SET_QC();
854
            dest = ~0;
855
        } else {
856
            dest = 0;
857
        }
858
    } else if (shift < -32) {
859
        dest = 0;
860
    } else if (shift == -32) {
861
        dest = val >> 31;
862
    } else if (shift < 0) {
863
        uint64_t big_dest = ((uint64_t)val + (1 << (-1 - shift)));
864
        dest = big_dest >> -shift;
865
    } else {
866
        dest = val << shift;
867
        if ((dest >> shift) != val) {
868
            SET_QC();
869
            dest = ~0;
870
        }
871
    }
872
    return dest;
873
}
874

    
875
/* Handling addition overflow with 64 bits inputs values is more
876
 * tricky than with 32 bits values.  */
877
uint64_t HELPER(neon_qrshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop)
878
{
879
    int8_t shift = (int8_t)shiftop;
880
    if (shift >= 64) {
881
        if (val) {
882
            SET_QC();
883
            val = ~0;
884
        }
885
    } else if (shift < -64) {
886
        val = 0;
887
    } else if (shift == -64) {
888
        val >>= 63;
889
    } else if (shift < 0) {
890
        val >>= (-shift - 1);
891
        if (val == UINT64_MAX) {
892
            /* In this case, it means that the rounding constant is 1,
893
             * and the addition would overflow. Return the actual
894
             * result directly.  */
895
            val = 0x8000000000000000ULL;
896
        } else {
897
            val++;
898
            val >>= 1;
899
        }
900
    } else { \
901
        uint64_t tmp = val;
902
        val <<= shift;
903
        if ((val >> shift) != tmp) {
904
            SET_QC();
905
            val = ~0;
906
        }
907
    }
908
    return val;
909
}
910

    
911
#define NEON_FN(dest, src1, src2) do { \
912
    int8_t tmp; \
913
    tmp = (int8_t)src2; \
914
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
915
        if (src1) { \
916
            SET_QC(); \
917
            dest = (1 << (sizeof(src1) * 8 - 1)); \
918
            if (src1 > 0) { \
919
                dest--; \
920
            } \
921
        } else { \
922
            dest = 0; \
923
        } \
924
    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
925
        dest = 0; \
926
    } else if (tmp < 0) { \
927
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
928
    } else { \
929
        dest = src1 << tmp; \
930
        if ((dest >> tmp) != src1) { \
931
            SET_QC(); \
932
            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
933
            if (src1 > 0) { \
934
                dest--; \
935
            } \
936
        } \
937
    }} while (0)
938
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
939
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
940
#undef NEON_FN
941

    
942
/* The addition of the rounding constant may overflow, so we use an
943
 * intermediate 64 bits accumulator.  */
944
uint32_t HELPER(neon_qrshl_s32)(CPUState *env, uint32_t valop, uint32_t shiftop)
945
{
946
    int32_t dest;
947
    int32_t val = (int32_t)valop;
948
    int8_t shift = (int8_t)shiftop;
949
    if (shift >= 32) {
950
        if (val) {
951
            SET_QC();
952
            dest = (val >> 31) ^ ~SIGNBIT;
953
        } else {
954
            dest = 0;
955
        }
956
    } else if (shift <= -32) {
957
        dest = 0;
958
    } else if (shift < 0) {
959
        int64_t big_dest = ((int64_t)val + (1 << (-1 - shift)));
960
        dest = big_dest >> -shift;
961
    } else {
962
        dest = val << shift;
963
        if ((dest >> shift) != val) {
964
            SET_QC();
965
            dest = (val >> 31) ^ ~SIGNBIT;
966
        }
967
    }
968
    return dest;
969
}
970

    
971
/* Handling addition overflow with 64 bits inputs values is more
972
 * tricky than with 32 bits values.  */
973
uint64_t HELPER(neon_qrshl_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
974
{
975
    int8_t shift = (uint8_t)shiftop;
976
    int64_t val = valop;
977

    
978
    if (shift >= 64) {
979
        if (val) {
980
            SET_QC();
981
            val = (val >> 63) ^ ~SIGNBIT64;
982
        }
983
    } else if (shift <= -64) {
984
        val = 0;
985
    } else if (shift < 0) {
986
        val >>= (-shift - 1);
987
        if (val == INT64_MAX) {
988
            /* In this case, it means that the rounding constant is 1,
989
             * and the addition would overflow. Return the actual
990
             * result directly.  */
991
            val = 0x4000000000000000ULL;
992
        } else {
993
            val++;
994
            val >>= 1;
995
        }
996
    } else {
997
        int64_t tmp = val;
998
        val <<= shift;
999
        if ((val >> shift) != tmp) {
1000
            SET_QC();
1001
            val = (tmp >> 63) ^ ~SIGNBIT64;
1002
        }
1003
    }
1004
    return val;
1005
}
1006

    
1007
uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
1008
{
1009
    uint32_t mask;
1010
    mask = (a ^ b) & 0x80808080u;
1011
    a &= ~0x80808080u;
1012
    b &= ~0x80808080u;
1013
    return (a + b) ^ mask;
1014
}
1015

    
1016
uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
1017
{
1018
    uint32_t mask;
1019
    mask = (a ^ b) & 0x80008000u;
1020
    a &= ~0x80008000u;
1021
    b &= ~0x80008000u;
1022
    return (a + b) ^ mask;
1023
}
1024

    
1025
#define NEON_FN(dest, src1, src2) dest = src1 + src2
1026
NEON_POP(padd_u8, neon_u8, 4)
1027
NEON_POP(padd_u16, neon_u16, 2)
1028
#undef NEON_FN
1029

    
1030
#define NEON_FN(dest, src1, src2) dest = src1 - src2
1031
NEON_VOP(sub_u8, neon_u8, 4)
1032
NEON_VOP(sub_u16, neon_u16, 2)
1033
#undef NEON_FN
1034

    
1035
#define NEON_FN(dest, src1, src2) dest = src1 * src2
1036
NEON_VOP(mul_u8, neon_u8, 4)
1037
NEON_VOP(mul_u16, neon_u16, 2)
1038
#undef NEON_FN
1039

    
1040
/* Polynomial multiplication is like integer multiplication except the
1041
   partial products are XORed, not added.  */
1042
uint32_t HELPER(neon_mul_p8)(uint32_t op1, uint32_t op2)
1043
{
1044
    uint32_t mask;
1045
    uint32_t result;
1046
    result = 0;
1047
    while (op1) {
1048
        mask = 0;
1049
        if (op1 & 1)
1050
            mask |= 0xff;
1051
        if (op1 & (1 << 8))
1052
            mask |= (0xff << 8);
1053
        if (op1 & (1 << 16))
1054
            mask |= (0xff << 16);
1055
        if (op1 & (1 << 24))
1056
            mask |= (0xff << 24);
1057
        result ^= op2 & mask;
1058
        op1 = (op1 >> 1) & 0x7f7f7f7f;
1059
        op2 = (op2 << 1) & 0xfefefefe;
1060
    }
1061
    return result;
1062
}
1063

    
1064
uint64_t HELPER(neon_mull_p8)(uint32_t op1, uint32_t op2)
1065
{
1066
    uint64_t result = 0;
1067
    uint64_t mask;
1068
    uint64_t op2ex = op2;
1069
    op2ex = (op2ex & 0xff) |
1070
        ((op2ex & 0xff00) << 8) |
1071
        ((op2ex & 0xff0000) << 16) |
1072
        ((op2ex & 0xff000000) << 24);
1073
    while (op1) {
1074
        mask = 0;
1075
        if (op1 & 1) {
1076
            mask |= 0xffff;
1077
        }
1078
        if (op1 & (1 << 8)) {
1079
            mask |= (0xffffU << 16);
1080
        }
1081
        if (op1 & (1 << 16)) {
1082
            mask |= (0xffffULL << 32);
1083
        }
1084
        if (op1 & (1 << 24)) {
1085
            mask |= (0xffffULL << 48);
1086
        }
1087
        result ^= op2ex & mask;
1088
        op1 = (op1 >> 1) & 0x7f7f7f7f;
1089
        op2ex <<= 1;
1090
    }
1091
    return result;
1092
}
1093

    
1094
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
1095
NEON_VOP(tst_u8, neon_u8, 4)
1096
NEON_VOP(tst_u16, neon_u16, 2)
1097
NEON_VOP(tst_u32, neon_u32, 1)
1098
#undef NEON_FN
1099

    
1100
#define NEON_FN(dest, src1, src2) dest = (src1 == src2) ? -1 : 0
1101
NEON_VOP(ceq_u8, neon_u8, 4)
1102
NEON_VOP(ceq_u16, neon_u16, 2)
1103
NEON_VOP(ceq_u32, neon_u32, 1)
1104
#undef NEON_FN
1105

    
1106
#define NEON_FN(dest, src, dummy) dest = (src < 0) ? -src : src
1107
NEON_VOP1(abs_s8, neon_s8, 4)
1108
NEON_VOP1(abs_s16, neon_s16, 2)
1109
#undef NEON_FN
1110

    
1111
/* Count Leading Sign/Zero Bits.  */

/* Number of leading zero bits in an 8-bit value; returns 8 for zero.  */
static inline int do_clz8(uint8_t x)
{
    int n = 8;

    /* Each shift that leaves x non-zero accounts for one used bit.  */
    while (x) {
        x >>= 1;
        n--;
    }
    return n;
}
1119

    
1120
/* Number of leading zero bits in a 16-bit value; returns 16 for zero.  */
static inline int do_clz16(uint16_t x)
{
    int n = 16;

    while (x) {
        x >>= 1;
        n--;
    }
    return n;
}
1127

    
1128
#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
1129
NEON_VOP1(clz_u8, neon_u8, 4)
1130
#undef NEON_FN
1131

    
1132
#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
1133
NEON_VOP1(clz_u16, neon_u16, 2)
1134
#undef NEON_FN
1135

    
1136
#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
1137
NEON_VOP1(cls_s8, neon_s8, 4)
1138
#undef NEON_FN
1139

    
1140
#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
1141
NEON_VOP1(cls_s16, neon_s16, 2)
1142
#undef NEON_FN
1143

    
1144
/* Count leading sign bits of a 32-bit value: the number of bits below the
   sign bit that are equal to it (0..31).  */
uint32_t HELPER(neon_cls_s32)(uint32_t x)
{
    int count = 32;

    /* Complement negative inputs so only leading zeros need counting.  */
    if ((int32_t)x < 0) {
        x = ~x;
    }
    while (x) {
        x >>= 1;
        count--;
    }
    /* The sign bit itself is not counted.  */
    return count - 1;
}
1153

    
1154
/* Bit count: returns the population count of each of the four bytes of x,
   packed in the same byte positions.  */
uint32_t HELPER(neon_cnt_u8)(uint32_t x)
{
    /* Sum adjacent 1-bit, then 2-bit, then 4-bit fields; stopping at 8
       bits keeps the per-byte totals separate.  */
    x = (x & 0x55555555) + ((x >> 1) & 0x55555555);
    x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
    x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f);
    return x;
}
1162

    
1163
#define NEON_QDMULH16(dest, src1, src2, round) do { \
1164
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
1165
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
1166
        SET_QC(); \
1167
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
1168
    } else { \
1169
        tmp <<= 1; \
1170
    } \
1171
    if (round) { \
1172
        int32_t old = tmp; \
1173
        tmp += 1 << 15; \
1174
        if ((int32_t)tmp < old) { \
1175
            SET_QC(); \
1176
            tmp = SIGNBIT - 1; \
1177
        } \
1178
    } \
1179
    dest = tmp >> 16; \
1180
    } while(0)
1181
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
1182
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
1183
#undef NEON_FN
1184
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
1185
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
1186
#undef NEON_FN
1187
#undef NEON_QDMULH16
1188

    
1189
#define NEON_QDMULH32(dest, src1, src2, round) do { \
1190
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
1191
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
1192
        SET_QC(); \
1193
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
1194
    } else { \
1195
        tmp <<= 1; \
1196
    } \
1197
    if (round) { \
1198
        int64_t old = tmp; \
1199
        tmp += (int64_t)1 << 31; \
1200
        if ((int64_t)tmp < old) { \
1201
            SET_QC(); \
1202
            tmp = SIGNBIT64 - 1; \
1203
        } \
1204
    } \
1205
    dest = tmp >> 32; \
1206
    } while(0)
1207
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
1208
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
1209
#undef NEON_FN
1210
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
1211
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
1212
#undef NEON_FN
1213
#undef NEON_QDMULH32
1214

    
1215
/* Narrow four 16-bit lanes to 8 bits by keeping the low byte of each.  */
uint32_t HELPER(neon_narrow_u8)(uint64_t x)
{
    uint32_t lane0 = x & 0xffu;
    uint32_t lane1 = (x >> 8) & 0xff00u;
    uint32_t lane2 = (x >> 16) & 0xff0000u;
    uint32_t lane3 = (x >> 24) & 0xff000000u;

    return lane0 | lane1 | lane2 | lane3;
}
1220

    
1221
/* Narrow two 32-bit lanes to 16 bits by keeping the low half of each.  */
uint32_t HELPER(neon_narrow_u16)(uint64_t x)
{
    uint32_t low = x & 0xffffu;
    uint32_t high = (x >> 16) & 0xffff0000u;

    return low | high;
}
1225

    
1226
/* Narrow four 16-bit lanes to 8 bits by keeping the high byte of each.  */
uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
{
    uint32_t lane0 = (x >> 8) & 0xff;
    uint32_t lane1 = (x >> 16) & 0xff00;
    uint32_t lane2 = (x >> 24) & 0xff0000;
    uint32_t lane3 = (x >> 32) & 0xff000000;

    return lane0 | lane1 | lane2 | lane3;
}
1231

    
1232
/* Narrow two 32-bit lanes to 16 bits by keeping the high half of each.  */
uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
{
    uint32_t low = (x >> 16) & 0xffff;
    uint32_t high = (x >> 32) & 0xffff0000;

    return low | high;
}
1236

    
1237
/* Narrow four 16-bit lanes to their high byte, rounding to nearest.  */
uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
{
    /* Clear the bits below the rounding position first so that a carry
       arriving from the lane below cannot ripple into this lane's high
       byte.  */
    x &= 0xff80ff80ff80ff80ull;
    /* Add half of the discarded range (bit 7) to every lane.  */
    x += 0x0080008000800080ull;
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}
1244

    
1245
/* Narrow two 32-bit lanes to their high half, rounding to nearest.  */
uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
{
    /* Drop the bits below the rounding position so a carry out of the low
       lane cannot change the rounding of the high lane.  */
    x &= 0xffff8000ffff8000ull;
    /* Add half of the discarded range (bit 15) to both lanes.  */
    x += 0x0000800000008000ull;
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}
1251

    
1252
/* Narrow four signed 16-bit lanes to unsigned 8 bits with saturation:
   negative lanes become 0, lanes above 0xff become 0xff.  SET_QC() is
   invoked for every lane that saturates.  */
uint32_t HELPER(neon_unarrow_sat8)(CPUState *env, uint64_t x)
{
    uint32_t res = 0;
    int lane;

    for (lane = 0; lane < 4; lane++) {
        uint16_t s = x >> (lane * 16);

        if (s & 0x8000) {
            /* Negative input: the result lane stays zero.  */
            SET_QC();
        } else if (s > 0xff) {
            SET_QC();
            res |= 0xffu << (lane * 8);
        } else {
            res |= (uint32_t)s << (lane * 8);
        }
    }
    return res;
}
1278

    
1279
/* Narrow four unsigned 16-bit lanes to 8 bits with saturation: lanes
   above 0xff become 0xff and invoke SET_QC().  */
uint32_t HELPER(neon_narrow_sat_u8)(CPUState *env, uint64_t x)
{
    uint32_t res = 0;
    int lane;

    for (lane = 0; lane < 4; lane++) {
        uint16_t s = x >> (lane * 16);
        uint8_t d;

        if (s > 0xff) {
            d = 0xff;
            SET_QC();
        } else {
            d = s;
        }
        res |= (uint32_t)d << (lane * 8);
    }
    return res;
}
1301

    
1302
/* Narrow four signed 16-bit lanes to signed 8 bits with saturation:
   lanes outside the int8_t range become 0x7f or 0x80 according to their
   sign and invoke SET_QC().  */
uint32_t HELPER(neon_narrow_sat_s8)(CPUState *env, uint64_t x)
{
    uint32_t res = 0;
    int lane;

    for (lane = 0; lane < 4; lane++) {
        int16_t s = x >> (lane * 16);
        uint8_t d;

        if (s != (int8_t)s) {
            /* s >> 15 is 0 or -1, which selects 0x7f or 0x80.  */
            d = (s >> 15) ^ 0x7f;
            SET_QC();
        } else {
            d = s;
        }
        res |= (uint32_t)d << (lane * 8);
    }
    return res;
}
1324

    
1325
/* Narrow two signed 32-bit lanes to unsigned 16 bits with saturation:
   negative lanes become 0, lanes above 0xffff become 0xffff.  SET_QC()
   is invoked for every lane that saturates.  */
uint32_t HELPER(neon_unarrow_sat16)(CPUState *env, uint64_t x)
{
    uint32_t low = x;
    uint32_t high = x >> 32;

    if (low & 0x80000000) {
        low = 0;
        SET_QC();
    } else if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }

    if (high & 0x80000000) {
        high = 0;
        SET_QC();
    } else if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}
1347

    
1348
/* Narrow two unsigned 32-bit lanes to 16 bits with saturation: lanes
   above 0xffff become 0xffff and invoke SET_QC().  */
uint32_t HELPER(neon_narrow_sat_u16)(CPUState *env, uint64_t x)
{
    uint32_t low = x;
    uint32_t high = x >> 32;

    if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}
1364

    
1365
/* Narrow two signed 32-bit lanes to signed 16 bits with saturation:
   lanes outside the int16_t range become 0x7fff or 0x8000 according to
   their sign and invoke SET_QC().  */
uint32_t HELPER(neon_narrow_sat_s16)(CPUState *env, uint64_t x)
{
    int32_t low = x;
    int32_t high = x >> 32;

    if (low != (int16_t)low) {
        /* low >> 31 is 0 or -1, which selects 0x7fff or ...8000.  */
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    /* Shift as unsigned: left-shifting a negative int is undefined.  */
    return (uint16_t)low | ((uint32_t)high << 16);
}
1381

    
1382
/* Narrow a signed 64-bit value to unsigned 32 bits with saturation:
   negative values give 0 and values above 0xffffffff give 0xffffffff,
   invoking SET_QC() in both cases.  */
uint32_t HELPER(neon_unarrow_sat32)(CPUState *env, uint64_t x)
{
    if (x & 0x8000000000000000ull) {
        SET_QC();
        return 0;
    }
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}
1394

    
1395
/* Narrow an unsigned 64-bit value to 32 bits with saturation: values
   above 0xffffffff give 0xffffffff and invoke SET_QC().  */
uint32_t HELPER(neon_narrow_sat_u32)(CPUState *env, uint64_t x)
{
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}
1403

    
1404
/* Narrow a signed 64-bit value to signed 32 bits with saturation: values
   outside the int32_t range give 0x7fffffff or 0x80000000 according to
   their sign and invoke SET_QC().  */
uint32_t HELPER(neon_narrow_sat_s32)(CPUState *env, uint64_t x)
{
    if ((int64_t)x != (int32_t)x) {
        SET_QC();
        /* The arithmetic shift yields 0 or -1, selecting the bound.  */
        return ((int64_t)x >> 63) ^ 0x7fffffff;
    }
    return x;
}
1412

    
1413
/* Zero-extend four 8-bit lanes to four 16-bit lanes.  */
uint64_t HELPER(neon_widen_u8)(uint32_t x)
{
    uint64_t ret = 0;
    int lane;

    for (lane = 0; lane < 4; lane++) {
        uint64_t elem = (uint8_t)(x >> (lane * 8));

        ret |= elem << (lane * 16);
    }
    return ret;
}
1426

    
1427
/* Sign-extend four 8-bit lanes to four 16-bit lanes.  */
uint64_t HELPER(neon_widen_s8)(uint32_t x)
{
    uint64_t ret = 0;
    int lane;

    for (lane = 0; lane < 4; lane++) {
        /* Sign-extend via int8_t, then trim to exactly 16 bits so the
           extension cannot spill into the next lane.  */
        uint64_t elem = (uint16_t)(int8_t)(x >> (lane * 8));

        ret |= elem << (lane * 16);
    }
    return ret;
}
1440

    
1441
/* Zero-extend two 16-bit lanes to two 32-bit lanes.  */
uint64_t HELPER(neon_widen_u16)(uint32_t x)
{
    uint64_t low = (uint16_t)x;
    uint64_t high = (uint16_t)(x >> 16);

    return low | (high << 32);
}
1446

    
1447
/* Sign-extend two 16-bit lanes to two 32-bit lanes.  */
uint64_t HELPER(neon_widen_s16)(uint32_t x)
{
    /* The low lane is trimmed to 32 bits so its sign extension stays out
       of the high lane; the high lane's excess bits are shifted out.  */
    uint64_t low = (uint32_t)(int16_t)x;
    uint64_t high = (int16_t)(x >> 16);

    return low | (high << 32);
}
1452

    
1453
/* Add four packed 16-bit lanes, each wrapping independently.  */
uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
{
    const uint64_t top_bits = 0x8000800080008000ull;
    /* Top bit of each lane's sum, ignoring the carry from below.  */
    uint64_t mask = (a ^ b) & top_bits;

    /* With the top bits cleared a lane can never carry into its
       neighbour; the top bits are folded back in with XOR.  */
    a &= ~top_bits;
    b &= ~top_bits;
    return (a + b) ^ mask;
}
1461

    
1462
/* Add two packed 32-bit lanes, each wrapping independently.  */
uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
{
    const uint64_t top_bits = 0x8000000080000000ull;
    uint64_t mask = (a ^ b) & top_bits;

    /* Clearing the top bits stops the low lane carrying into the high
       lane; they are restored with XOR after the add.  */
    a &= ~top_bits;
    b &= ~top_bits;
    return (a + b) ^ mask;
}
1470

    
1471
/* Pairwise add of 16-bit lanes.  Result lanes 0 and 1 are a0+a1 and
   a2+a3; result lanes 2 and 3 are b0+b1 and b2+b3 (each modulo 2^16).  */
uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
{
    uint64_t sum_a;
    uint64_t sum_b;

    /* Sums of a's pairs land in the low half of each 32-bit word.  */
    sum_a = a & 0x0000ffff0000ffffull;
    sum_a += (a >> 16) & 0x0000ffff0000ffffull;
    /* Sums of b's pairs land in the high half of each 32-bit word.  */
    sum_b = b & 0xffff0000ffff0000ull;
    sum_b += (b << 16) & 0xffff0000ffff0000ull;
    return    ( sum_a         & 0xffff)
            | ((sum_a  >> 16) & 0xffff0000ull)
            | ((sum_b << 16) & 0xffff00000000ull)
            | ( sum_b        & 0xffff000000000000ull);
}
1485

    
1486
/* Pairwise add of 32-bit lanes: the low result lane is the sum of a's two
   lanes, the high result lane the sum of b's two lanes (modulo 2^32).  */
uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
{
    uint32_t sum_a = a + (a >> 32);
    uint32_t sum_b = b + (b >> 32);

    return sum_a + ((uint64_t)sum_b << 32);
}
1492

    
1493
/* Subtract four packed 16-bit lanes (a - b), each wrapping
   independently.  */
uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
{
    const uint64_t top_bits = 0x8000800080008000ull;
    uint64_t mask = (a ^ ~b) & top_bits;

    /* Forcing the top bit on in a and off in b means no lane can borrow
       from its neighbour; the true top bits are restored with XOR.  */
    a |= top_bits;
    b &= ~top_bits;
    return (a - b) ^ mask;
}
1501

    
1502
/* Subtract two packed 32-bit lanes (a - b), each wrapping
   independently.  */
uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
{
    const uint64_t top_bits = 0x8000000080000000ull;
    uint64_t mask = (a ^ ~b) & top_bits;

    /* Same borrow-isolation trick as the 16-bit variant.  */
    a |= top_bits;
    b &= ~top_bits;
    return (a - b) ^ mask;
}
1510

    
1511
/* Signed saturating add of two packed 32-bit lanes.  A lane overflows
   when both operands have the same sign and the sum's sign differs; it
   then saturates to 0x7fffffff or 0x80000000 by the sign of the first
   operand and SET_QC() is invoked.  */
uint64_t HELPER(neon_addl_saturate_s32)(CPUState *env, uint64_t a, uint64_t b)
{
    uint32_t x, y;
    uint32_t low, high;

    /* Low lane.  */
    x = a;
    y = b;
    low = x + y;
    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }

    /* High lane.  */
    x = a >> 32;
    y = b >> 32;
    high = x + y;
    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    return low | ((uint64_t)high << 32);
}
1532

    
1533
/* Signed saturating 64-bit add.  On overflow (operands share a sign that
   the sum does not) the result saturates to the extreme value matching
   the sign of 'a' and SET_QC() is invoked.  */
uint64_t HELPER(neon_addl_saturate_s64)(CPUState *env, uint64_t a, uint64_t b)
{
    uint64_t result = a + b;

    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
        SET_QC();
        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
    }
    return result;
}
1544

    
1545
/* Widening absolute difference.
   DO_ABD truncates x and y to 'intype' (the source lane type) and then
   computes |x - y| in 'arithtype', which must be wide enough to hold the
   difference.  Doing the subtraction in the lane type itself overflowed
   for 32-bit signed lanes (e.g. INT32_MAX vs INT32_MIN).  */
#define DO_ABD(dest, x, y, intype, arithtype) do { \
    arithtype tmp_x = (intype)(x); \
    arithtype tmp_y = (intype)(y); \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while(0)

/* Four unsigned 8-bit lanes -> four 16-bit absolute differences.  */
uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint8_t, uint32_t);
    DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
    result |= tmp << 48;
    return result;
}

/* Four signed 8-bit lanes -> four 16-bit absolute differences.  */
uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int8_t, int32_t);
    DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
    result |= tmp << 48;
    return result;
}

/* Two unsigned 16-bit lanes -> two 32-bit absolute differences.  */
uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint16_t, uint32_t);
    DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

/* Two signed 16-bit lanes -> two 32-bit absolute differences.  */
uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int16_t, int32_t);
    DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
    return result | (tmp << 32);
}

/* One unsigned 32-bit lane -> 64-bit absolute difference.  */
uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, uint32_t, uint64_t);
    return result;
}

/* One signed 32-bit lane -> 64-bit absolute difference.  */
uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, int32_t, int64_t);
    return result;
}
#undef DO_ABD
1611

    
1612
/* Widening multiply. Named type is the source type.
   DO_MULL truncates x and y to type1 (the source lane type) and stores
   their product truncated to type2 (the unsigned result lane type).
   The leading 1u forces the multiplication to be unsigned: without it
   uint16_t operands promote to signed int and 0xffff * 0xffff
   overflows.  */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)(1u * (type2)tmp_x * (type2)tmp_y); \
    } while(0)

/* Four unsigned 8-bit lanes -> four 16-bit products.  */
uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

/* Four signed 8-bit lanes -> four 16-bit products.  */
uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

/* Two unsigned 16-bit lanes -> two 32-bit products.  */
uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

/* Two signed 16-bit lanes -> two 32-bit products.  */
uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
    return result | (tmp << 32);
}
1658

    
1659
uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
1660
{
1661
    uint64_t tmp;
1662
    uint64_t result;
1663

    
1664
    DO_MULL(result, a, b, int16_t, uint32_t);
1665
    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
1666
    return result | (tmp << 32);
1667
}
1668

    
1669
/* Negate four packed 16-bit lanes (two's complement, modulo 2^16).  */
uint64_t HELPER(neon_negl_u16)(uint64_t x)
{
    uint64_t result = 0;
    int lane;

    for (lane = 0; lane < 4; lane++) {
        /* Truncating to 16 bits discards whatever the higher lanes
           contributed to the 64-bit negation.  */
        uint16_t neg = -(x >> (lane * 16));

        result |= (uint64_t)neg << (lane * 16);
    }
    return result;
}
1682

    
1683
/* Negate two packed 32-bit lanes (two's complement, modulo 2^32).  */
uint64_t HELPER(neon_negl_u32)(uint64_t x)
{
    uint32_t low = -x;
    uint32_t high = -(x >> 32);

    return low | ((uint64_t)high << 32);
}
1689

    
1690
/* Two's complement negation of a 64-bit value.
   FIXME:  There should be a native op for this.  */
uint64_t HELPER(neon_negl_u64)(uint64_t x)
{
    return -x;
}
1695

    
1696
/* Saturating sign manipulation.  */
1697
/* ??? Make these use NEON_VOP1 */
1698
#define DO_QABS8(x) do { \
1699
    if (x == (int8_t)0x80) { \
1700
        x = 0x7f; \
1701
        SET_QC(); \
1702
    } else if (x < 0) { \
1703
        x = -x; \
1704
    }} while (0)
1705
uint32_t HELPER(neon_qabs_s8)(CPUState *env, uint32_t x)
1706
{
1707
    neon_s8 vec;
1708
    NEON_UNPACK(neon_s8, vec, x);
1709
    DO_QABS8(vec.v1);
1710
    DO_QABS8(vec.v2);
1711
    DO_QABS8(vec.v3);
1712
    DO_QABS8(vec.v4);
1713
    NEON_PACK(neon_s8, x, vec);
1714
    return x;
1715
}
1716
#undef DO_QABS8
1717

    
1718
#define DO_QNEG8(x) do { \
1719
    if (x == (int8_t)0x80) { \
1720
        x = 0x7f; \
1721
        SET_QC(); \
1722
    } else { \
1723
        x = -x; \
1724
    }} while (0)
1725
uint32_t HELPER(neon_qneg_s8)(CPUState *env, uint32_t x)
1726
{
1727
    neon_s8 vec;
1728
    NEON_UNPACK(neon_s8, vec, x);
1729
    DO_QNEG8(vec.v1);
1730
    DO_QNEG8(vec.v2);
1731
    DO_QNEG8(vec.v3);
1732
    DO_QNEG8(vec.v4);
1733
    NEON_PACK(neon_s8, x, vec);
1734
    return x;
1735
}
1736
#undef DO_QNEG8
1737

    
1738
#define DO_QABS16(x) do { \
1739
    if (x == (int16_t)0x8000) { \
1740
        x = 0x7fff; \
1741
        SET_QC(); \
1742
    } else if (x < 0) { \
1743
        x = -x; \
1744
    }} while (0)
1745
uint32_t HELPER(neon_qabs_s16)(CPUState *env, uint32_t x)
1746
{
1747
    neon_s16 vec;
1748
    NEON_UNPACK(neon_s16, vec, x);
1749
    DO_QABS16(vec.v1);
1750
    DO_QABS16(vec.v2);
1751
    NEON_PACK(neon_s16, x, vec);
1752
    return x;
1753
}
1754
#undef DO_QABS16
1755

    
1756
#define DO_QNEG16(x) do { \
1757
    if (x == (int16_t)0x8000) { \
1758
        x = 0x7fff; \
1759
        SET_QC(); \
1760
    } else { \
1761
        x = -x; \
1762
    }} while (0)
1763
uint32_t HELPER(neon_qneg_s16)(CPUState *env, uint32_t x)
1764
{
1765
    neon_s16 vec;
1766
    NEON_UNPACK(neon_s16, vec, x);
1767
    DO_QNEG16(vec.v1);
1768
    DO_QNEG16(vec.v2);
1769
    NEON_PACK(neon_s16, x, vec);
1770
    return x;
1771
}
1772
#undef DO_QNEG16
1773

    
1774
/* Saturating absolute value of a signed 32-bit value: 0x80000000 becomes
   0x7fffffff and invokes SET_QC(); other negatives are negated.  */
uint32_t HELPER(neon_qabs_s32)(CPUState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        return ~SIGNBIT;
    }
    if ((int32_t)x < 0) {
        return -x;
    }
    return x;
}
1784

    
1785
/* Saturating negation of a signed 32-bit value: 0x80000000 becomes
   0x7fffffff and invokes SET_QC(); every other value is negated.  */
uint32_t HELPER(neon_qneg_s32)(CPUState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        return ~SIGNBIT;
    }
    return -x;
}
1795

    
1796
/* NEON Float helpers.  */
1797
uint32_t HELPER(neon_min_f32)(uint32_t a, uint32_t b)
1798
{
1799
    float32 f0 = vfp_itos(a);
1800
    float32 f1 = vfp_itos(b);
1801
    return (float32_compare_quiet(f0, f1, NFS) == -1) ? a : b;
1802
}
1803

    
1804
uint32_t HELPER(neon_max_f32)(uint32_t a, uint32_t b)
1805
{
1806
    float32 f0 = vfp_itos(a);
1807
    float32 f1 = vfp_itos(b);
1808
    return (float32_compare_quiet(f0, f1, NFS) == 1) ? a : b;
1809
}
1810

    
1811
uint32_t HELPER(neon_abd_f32)(uint32_t a, uint32_t b)
1812
{
1813
    float32 f0 = vfp_itos(a);
1814
    float32 f1 = vfp_itos(b);
1815
    return vfp_stoi((float32_compare_quiet(f0, f1, NFS) == 1)
1816
                    ? float32_sub(f0, f1, NFS)
1817
                    : float32_sub(f1, f0, NFS));
1818
}
1819

    
1820
uint32_t HELPER(neon_add_f32)(uint32_t a, uint32_t b)
1821
{
1822
    return vfp_stoi(float32_add(vfp_itos(a), vfp_itos(b), NFS));
1823
}
1824

    
1825
uint32_t HELPER(neon_sub_f32)(uint32_t a, uint32_t b)
1826
{
1827
    return vfp_stoi(float32_sub(vfp_itos(a), vfp_itos(b), NFS));
1828
}
1829

    
1830
uint32_t HELPER(neon_mul_f32)(uint32_t a, uint32_t b)
1831
{
1832
    return vfp_stoi(float32_mul(vfp_itos(a), vfp_itos(b), NFS));
1833
}
1834

    
1835
/* Floating point comparisons produce an integer result.  */
1836
#define NEON_VOP_FCMP(name, cmp) \
1837
uint32_t HELPER(neon_##name)(uint32_t a, uint32_t b) \
1838
{ \
1839
    if (float32_compare_quiet(vfp_itos(a), vfp_itos(b), NFS) cmp 0) \
1840
        return ~0; \
1841
    else \
1842
        return 0; \
1843
}
1844

    
1845
NEON_VOP_FCMP(ceq_f32, ==)
1846
NEON_VOP_FCMP(cge_f32, >=)
1847
NEON_VOP_FCMP(cgt_f32, >)
1848

    
1849
uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b)
1850
{
1851
    float32 f0 = float32_abs(vfp_itos(a));
1852
    float32 f1 = float32_abs(vfp_itos(b));
1853
    return (float32_compare_quiet(f0, f1,NFS) >= 0) ? ~0 : 0;
1854
}
1855

    
1856
uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b)
1857
{
1858
    float32 f0 = float32_abs(vfp_itos(a));
1859
    float32 f1 = float32_abs(vfp_itos(b));
1860
    return (float32_compare_quiet(f0, f1, NFS) > 0) ? ~0 : 0;
1861
}
1862

    
1863
/* Extract element N (counting from the least significant end) of width
   SIZE bits from V, as an unsigned 64-bit value.  SIZE must be below 64.  */
#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))
1864

    
1865
/* De-interleave 8-bit elements across two register pairs.  Treating
   regs[rd], regs[rd + 1], regs[rm], regs[rm + 1] as one 32-element
   sequence, the even-numbered elements are written to regs[rd] and
   regs[rd + 1] and the odd-numbered ones to regs[rm] and regs[rm + 1].  */
void HELPER(neon_qunzip8)(CPUState *env, uint32_t rd, uint32_t rm)
{
    uint64_t src[4];
    uint64_t even[2] = { 0, 0 };
    uint64_t odd[2] = { 0, 0 };
    int i;

    src[0] = float64_val(env->vfp.regs[rd]);
    src[1] = float64_val(env->vfp.regs[rd + 1]);
    src[2] = float64_val(env->vfp.regs[rm]);
    src[3] = float64_val(env->vfp.regs[rm + 1]);

    for (i = 0; i < 16; i++) {
        int e = 2 * i;      /* index of the even source element */
        int o = e + 1;      /* index of the odd source element */
        int shift = (i % 8) * 8;

        even[i / 8] |= ELEM(src[e / 8], e % 8, 8) << shift;
        odd[i / 8] |= ELEM(src[o / 8], o % 8, 8) << shift;
    }
    env->vfp.regs[rm] = make_float64(odd[0]);
    env->vfp.regs[rm + 1] = make_float64(odd[1]);
    env->vfp.regs[rd] = make_float64(even[0]);
    env->vfp.regs[rd + 1] = make_float64(even[1]);
}
1892

    
1893
/* De-interleave 16-bit elements across two register pairs.  Treating
   regs[rd], regs[rd + 1], regs[rm], regs[rm + 1] as one 16-element
   sequence, the even-numbered elements are written to regs[rd] and
   regs[rd + 1] and the odd-numbered ones to regs[rm] and regs[rm + 1].  */
void HELPER(neon_qunzip16)(CPUState *env, uint32_t rd, uint32_t rm)
{
    uint64_t src[4];
    uint64_t even[2] = { 0, 0 };
    uint64_t odd[2] = { 0, 0 };
    int i;

    src[0] = float64_val(env->vfp.regs[rd]);
    src[1] = float64_val(env->vfp.regs[rd + 1]);
    src[2] = float64_val(env->vfp.regs[rm]);
    src[3] = float64_val(env->vfp.regs[rm + 1]);

    for (i = 0; i < 8; i++) {
        int e = 2 * i;
        int o = e + 1;
        int shift = (i % 4) * 16;

        even[i / 4] |= ELEM(src[e / 4], e % 4, 16) << shift;
        odd[i / 4] |= ELEM(src[o / 4], o % 4, 16) << shift;
    }
    env->vfp.regs[rm] = make_float64(odd[0]);
    env->vfp.regs[rm + 1] = make_float64(odd[1]);
    env->vfp.regs[rd] = make_float64(even[0]);
    env->vfp.regs[rd + 1] = make_float64(even[1]);
}
1912

    
1913
/* De-interleave 32-bit elements across two register pairs.  Treating
   regs[rd], regs[rd + 1], regs[rm], regs[rm + 1] as one 8-element
   sequence, the even-numbered elements are written to regs[rd] and
   regs[rd + 1] and the odd-numbered ones to regs[rm] and regs[rm + 1].  */
void HELPER(neon_qunzip32)(CPUState *env, uint32_t rd, uint32_t rm)
{
    uint64_t src[4];
    uint64_t even[2] = { 0, 0 };
    uint64_t odd[2] = { 0, 0 };
    int i;

    src[0] = float64_val(env->vfp.regs[rd]);
    src[1] = float64_val(env->vfp.regs[rd + 1]);
    src[2] = float64_val(env->vfp.regs[rm]);
    src[3] = float64_val(env->vfp.regs[rm + 1]);

    for (i = 0; i < 4; i++) {
        int e = 2 * i;
        int o = e + 1;
        int shift = (i % 2) * 32;

        even[i / 2] |= ELEM(src[e / 2], e % 2, 32) << shift;
        odd[i / 2] |= ELEM(src[o / 2], o % 2, 32) << shift;
    }
    env->vfp.regs[rm] = make_float64(odd[0]);
    env->vfp.regs[rm + 1] = make_float64(odd[1]);
    env->vfp.regs[rd] = make_float64(even[0]);
    env->vfp.regs[rd + 1] = make_float64(even[1]);
}
1928

    
1929
/* De-interleave 8-bit elements of two registers.  Treating regs[rd]
   followed by regs[rm] as one 16-element sequence, the even-numbered
   elements are written to regs[rd] and the odd-numbered ones to
   regs[rm].  */
void HELPER(neon_unzip8)(CPUState *env, uint32_t rd, uint32_t rm)
{
    uint64_t src[2];
    uint64_t even = 0;
    uint64_t odd = 0;
    int i;

    src[0] = float64_val(env->vfp.regs[rd]);
    src[1] = float64_val(env->vfp.regs[rm]);

    for (i = 0; i < 8; i++) {
        int e = 2 * i;
        int o = e + 1;

        even |= ELEM(src[e / 8], e % 8, 8) << (i * 8);
        odd |= ELEM(src[o / 8], o % 8, 8) << (i * 8);
    }
    env->vfp.regs[rm] = make_float64(odd);
    env->vfp.regs[rd] = make_float64(even);
}
1944

    
1945
/* De-interleave 16-bit elements of two registers.  Treating regs[rd]
   followed by regs[rm] as one 8-element sequence, the even-numbered
   elements are written to regs[rd] and the odd-numbered ones to
   regs[rm].  */
void HELPER(neon_unzip16)(CPUState *env, uint32_t rd, uint32_t rm)
{
    uint64_t src[2];
    uint64_t even = 0;
    uint64_t odd = 0;
    int i;

    src[0] = float64_val(env->vfp.regs[rd]);
    src[1] = float64_val(env->vfp.regs[rm]);

    for (i = 0; i < 4; i++) {
        int e = 2 * i;
        int o = e + 1;

        even |= ELEM(src[e / 4], e % 4, 16) << (i * 16);
        odd |= ELEM(src[o / 4], o % 4, 16) << (i * 16);
    }
    env->vfp.regs[rm] = make_float64(odd);
    env->vfp.regs[rd] = make_float64(even);
}
1956

    
1957
/* Interleave 8-bit elements of two register pairs.  Element k of the
   pair regs[rd], regs[rd + 1] goes to output position 2k and element k
   of the pair regs[rm], regs[rm + 1] to position 2k + 1; the 32-element
   output is written to regs[rd], regs[rd + 1], regs[rm], regs[rm + 1]
   in that order of significance.  */
void HELPER(neon_qzip8)(CPUState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm[2];
    uint64_t zd[2];
    uint64_t out[4] = { 0, 0, 0, 0 };
    int k;

    zm[0] = float64_val(env->vfp.regs[rm]);
    zm[1] = float64_val(env->vfp.regs[rm + 1]);
    zd[0] = float64_val(env->vfp.regs[rd]);
    zd[1] = float64_val(env->vfp.regs[rd + 1]);

    for (k = 0; k < 16; k++) {
        int pd = 2 * k;     /* output position of the rd element */
        int pm = pd + 1;    /* output position of the rm element */

        out[pd / 8] |= ELEM(zd[k / 8], k % 8, 8) << ((pd % 8) * 8);
        out[pm / 8] |= ELEM(zm[k / 8], k % 8, 8) << ((pm % 8) * 8);
    }
    env->vfp.regs[rm] = make_float64(out[2]);
    env->vfp.regs[rm + 1] = make_float64(out[3]);
    env->vfp.regs[rd] = make_float64(out[0]);
    env->vfp.regs[rd + 1] = make_float64(out[1]);
}
1984

    
1985
/* Interleave 16-bit elements of two register pairs.  Element k of the
   pair regs[rd], regs[rd + 1] goes to output position 2k and element k
   of the pair regs[rm], regs[rm + 1] to position 2k + 1; the 16-element
   output is written to regs[rd], regs[rd + 1], regs[rm], regs[rm + 1]
   in that order of significance.  */
void HELPER(neon_qzip16)(CPUState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm[2];
    uint64_t zd[2];
    uint64_t out[4] = { 0, 0, 0, 0 };
    int k;

    zm[0] = float64_val(env->vfp.regs[rm]);
    zm[1] = float64_val(env->vfp.regs[rm + 1]);
    zd[0] = float64_val(env->vfp.regs[rd]);
    zd[1] = float64_val(env->vfp.regs[rd + 1]);

    for (k = 0; k < 8; k++) {
        int pd = 2 * k;
        int pm = pd + 1;

        out[pd / 4] |= ELEM(zd[k / 4], k % 4, 16) << ((pd % 4) * 16);
        out[pm / 4] |= ELEM(zm[k / 4], k % 4, 16) << ((pm % 4) * 16);
    }
    env->vfp.regs[rm] = make_float64(out[2]);
    env->vfp.regs[rm + 1] = make_float64(out[3]);
    env->vfp.regs[rd] = make_float64(out[0]);
    env->vfp.regs[rd + 1] = make_float64(out[1]);
}
2004

    
2005
/* Interleave 32-bit elements of two register pairs.  Element k of the
   pair regs[rd], regs[rd + 1] goes to output position 2k and element k
   of the pair regs[rm], regs[rm + 1] to position 2k + 1; the 8-element
   output is written to regs[rd], regs[rd + 1], regs[rm], regs[rm + 1]
   in that order of significance.  */
void HELPER(neon_qzip32)(CPUState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm[2];
    uint64_t zd[2];
    uint64_t out[4];
    int k;

    zm[0] = float64_val(env->vfp.regs[rm]);
    zm[1] = float64_val(env->vfp.regs[rm + 1]);
    zd[0] = float64_val(env->vfp.regs[rd]);
    zd[1] = float64_val(env->vfp.regs[rd + 1]);

    /* Each output word holds one rd element (low half) and the matching
       rm element (high half).  */
    for (k = 0; k < 4; k++) {
        out[k] = ELEM(zd[k / 2], k % 2, 32)
            | (ELEM(zm[k / 2], k % 2, 32) << 32);
    }
    env->vfp.regs[rm] = make_float64(out[2]);
    env->vfp.regs[rm + 1] = make_float64(out[3]);
    env->vfp.regs[rd] = make_float64(out[0]);
    env->vfp.regs[rd + 1] = make_float64(out[1]);
}
2020

    
2021
/* Interleave 8-bit elements of two registers.  Element k of regs[rd]
   goes to output position 2k and element k of regs[rm] to position
   2k + 1; the low half of the 16-element output is written to regs[rd]
   and the high half to regs[rm].  */
void HELPER(neon_zip8)(CPUState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm = float64_val(env->vfp.regs[rm]);
    uint64_t zd = float64_val(env->vfp.regs[rd]);
    uint64_t out[2] = { 0, 0 };
    int k;

    for (k = 0; k < 8; k++) {
        int pd = 2 * k;
        int pm = pd + 1;

        out[pd / 8] |= ELEM(zd, k, 8) << ((pd % 8) * 8);
        out[pm / 8] |= ELEM(zm, k, 8) << ((pm % 8) * 8);
    }
    env->vfp.regs[rm] = make_float64(out[1]);
    env->vfp.regs[rd] = make_float64(out[0]);
}
2036

    
2037
/* Interleave 16-bit elements of two registers.  Element k of regs[rd]
   goes to output position 2k and element k of regs[rm] to position
   2k + 1; the low half of the 8-element output is written to regs[rd]
   and the high half to regs[rm].  */
void HELPER(neon_zip16)(CPUState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm = float64_val(env->vfp.regs[rm]);
    uint64_t zd = float64_val(env->vfp.regs[rd]);
    uint64_t out[2] = { 0, 0 };
    int k;

    for (k = 0; k < 4; k++) {
        int pd = 2 * k;
        int pm = pd + 1;

        out[pd / 4] |= ELEM(zd, k, 16) << ((pd % 4) * 16);
        out[pm / 4] |= ELEM(zm, k, 16) << ((pm % 4) * 16);
    }
    env->vfp.regs[rm] = make_float64(out[1]);
    env->vfp.regs[rd] = make_float64(out[0]);
}