Statistics
| Branch: | Revision:

root / target-arm / neon_helper.c @ 4bd4ee07

History | View | Annotate | Download (51.5 kB)

1
/*
2
 * ARM NEON vector operations.
3
 *
4
 * Copyright (c) 2007, 2008 CodeSourcery.
5
 * Written by Paul Brook
6
 *
7
 * This code is licenced under the GNU GPL v2.
8
 */
9
#include <stdlib.h>
10
#include <stdio.h>
11

    
12
#include "cpu.h"
13
#include "exec-all.h"
14
#include "helpers.h"
15

    
16
/* Top (sign) bit of a 32-bit value and of a 64-bit value.  */
#define SIGNBIT ((uint32_t)1 << 31)
#define SIGNBIT64 ((uint64_t)1 << 63)
18

    
19
/* Record that a saturating operation clipped its result by setting the
 * CPSR_Q bit in the FPSCR word.  The bit is OR-ed in: a plain assignment
 * would discard every other bit stored in that word each time a helper
 * saturates.  Expands to an expression using a variable named `env`, which
 * must be in scope at the point of use.  */
#define SET_QC() env->vfp.xregs[ARM_VFP_FPSCR] |= CPSR_Q
20

    
21
/* File-local floating-point status object.  */
static float_status neon_float_status;
/* NFS expands to the address of that status object.  */
#define NFS (&neon_float_status)
23

    
24
/* Helper routines to perform bitwise copies between float and int.  */
25
static inline float32 vfp_itos(uint32_t i)
26
{
27
    union {
28
        uint32_t i;
29
        float32 s;
30
    } v;
31

    
32
    v.i = i;
33
    return v.s;
34
}
35

    
36
/* Return the raw 32-bit pattern stored in the float32 S.  */
static inline uint32_t vfp_stoi(float32 s)
{
    union {
        float32 as_float;
        uint32_t as_bits;
    } pun;

    pun.as_float = s;
    return pun.as_bits;
}
46

    
47
/* Lane views of one 32-bit vector chunk.
 *
 * NEON_TYPE<n>(name, type) declares the struct typedef neon_<name> with n
 * members v1..vn of TYPE.  When HOST_WORDS_BIGENDIAN is defined the members
 * are declared in reverse order (vn first, v1 last); otherwise v1 is the
 * first member.  */
#define NEON_TYPE1(name, type) \
typedef struct { type v1; } neon_##name;

#ifdef HOST_WORDS_BIGENDIAN

#define NEON_TYPE2(name, type) \
typedef struct { type v2; type v1; } neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct { type v4; type v3; type v2; type v1; } neon_##name;

#else

#define NEON_TYPE2(name, type) \
typedef struct { type v1; type v2; } neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct { type v1; type v2; type v3; type v4; } neon_##name;

#endif

/* Four 8-bit lanes, two 16-bit lanes or one 32-bit lane, signed and
 * unsigned.  */
NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)

/* The generator macros are not needed past this point.  */
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1
93

    
94
/* Reinterpret the 32-bit value VAL as the lane structure VTYPE and store it
 * in DEST.  The conversion goes through a union, so it is a bit-for-bit
 * copy.  */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        uint32_t word; \
        vtype lanes; \
    } neon_cvt; \
    neon_cvt.word = (val); \
    dest = neon_cvt.lanes; \
    } while(0)
103

    
104
/* Reinterpret the lane structure VAL (of type VTYPE) as a 32-bit value and
 * store it in DEST.  The conversion goes through a union, so it is a
 * bit-for-bit copy.  */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        uint32_t word; \
        vtype lanes; \
    } neon_cvt; \
    neon_cvt.lanes = (val); \
    dest = neon_cvt.word; \
    } while(0)
113

    
114
/* Apply NEON_FN lane by lane: lane k of vsrc1 and vsrc2 produces lane k of
 * vdest.  NEON_FN, vsrc1, vsrc2 and vdest are resolved where the macro is
 * expanded.  Each wider variant extends the narrower one.  */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_DO1 \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_DO2 \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);
124

    
125
/* Function body shared by the two-operand helpers: unpack arg1 and arg2
 * into lane structures of type VTYPE, run NEON_DO<n> over them, and return
 * the repacked result.  */
#define NEON_VOP_BODY(vtype, n) \
{ \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    uint32_t res; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Define helper neon_<name>(arg1, arg2) as an element-wise operation.  */
#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* As NEON_VOP, but the helper also receives `env`, which makes SET_QC()
 * usable inside NEON_FN.  */
#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)
145

    
146
/* Pairwise operations: NEON_FN combines adjacent lanes of one source.  The
 * low half of vdest is produced from vsrc1 and the high half from vsrc2.
 * A 32-bit element fills its segment on its own, so for that width the
 * pairwise and element-wise forms coincide and no NEON_PDO1 is needed.  */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4);
157

    
158
/* Define helper neon_<name>(arg1, arg2) as a pairwise operation: both
 * arguments are unpacked to VTYPE, NEON_PDO<n> combines neighbouring lanes,
 * and the repacked result is returned.  */
#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    uint32_t res; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}
171

    
172
/* Unary operators: define helper neon_<name>(arg).
 * The body reuses NEON_DO<n>, which names vsrc2 as the third NEON_FN
 * argument.  No vsrc2 is declared here, so the NEON_FN in effect must not
 * expand its third parameter.  The result is packed back into `arg` and
 * returned.  */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vdest; \
    vtype vsrc1; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}
183

    
184

    
185
#define NEON_USAT(dest, src1, src2, type) do { \
186
    uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
187
    if (tmp != (type)tmp) { \
188
        SET_QC(); \
189
        dest = ~0; \
190
    } else { \
191
        dest = tmp; \
192
    }} while(0)
193
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
194
NEON_VOP_ENV(qadd_u8, neon_u8, 4)
195
#undef NEON_FN
196
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
197
NEON_VOP_ENV(qadd_u16, neon_u16, 2)
198
#undef NEON_FN
199
#undef NEON_USAT
200

    
201
/* Unsigned saturating 32-bit add: returns a + b, or all ones (after
 * SET_QC()) when the addition wraps.  */
uint32_t HELPER(neon_qadd_u32)(CPUState *env, uint32_t a, uint32_t b)
{
    uint32_t sum = a + b;

    /* A wrapped unsigned sum is smaller than either operand.  */
    if (sum >= a) {
        return sum;
    }
    SET_QC();
    return ~0;
}

/* Unsigned saturating 64-bit add: returns src1 + src2, or all ones (after
 * SET_QC()) when the addition wraps.  */
uint64_t HELPER(neon_qadd_u64)(CPUState *env, uint64_t src1, uint64_t src2)
{
    uint64_t sum = src1 + src2;

    /* A wrapped unsigned sum is smaller than either operand.  */
    if (sum >= src1) {
        return sum;
    }
    SET_QC();
    return ~(uint64_t)0;
}
222

    
223
#define NEON_SSAT(dest, src1, src2, type) do { \
224
    int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
225
    if (tmp != (type)tmp) { \
226
        SET_QC(); \
227
        if (src2 > 0) { \
228
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
229
        } else { \
230
            tmp = 1 << (sizeof(type) * 8 - 1); \
231
        } \
232
    } \
233
    dest = tmp; \
234
    } while(0)
235
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
236
NEON_VOP_ENV(qadd_s8, neon_s8, 4)
237
#undef NEON_FN
238
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
239
NEON_VOP_ENV(qadd_s16, neon_s16, 2)
240
#undef NEON_FN
241
#undef NEON_SSAT
242

    
243
/* Signed saturating 32-bit add on values passed as raw 32-bit patterns.
 * On overflow SET_QC() is invoked and the result is 0x7fffffff when `a` is
 * non-negative, 0x80000000 when it is negative.  */
uint32_t HELPER(neon_qadd_s32)(CPUState *env, uint32_t a, uint32_t b)
{
    uint32_t sum = a + b;

    /* Overflow: the operands agree in sign but the sum does not.  */
    if (!((a ^ b) & SIGNBIT) && ((sum ^ a) & SIGNBIT)) {
        SET_QC();
        return ~(((int32_t)a >> 31) ^ SIGNBIT);
    }
    return sum;
}

/* Signed saturating 64-bit add on values passed as raw 64-bit patterns.
 * On overflow SET_QC() is invoked and the result is the largest int64 when
 * src1 is non-negative, the smallest when it is negative.  */
uint64_t HELPER(neon_qadd_s64)(CPUState *env, uint64_t src1, uint64_t src2)
{
    uint64_t sum = src1 + src2;

    /* Overflow: the operands agree in sign but the sum does not.  */
    if (!((src1 ^ src2) & SIGNBIT64) && ((sum ^ src1) & SIGNBIT64)) {
        SET_QC();
        return ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
    }
    return sum;
}
264

    
265
#define NEON_USAT(dest, src1, src2, type) do { \
266
    uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
267
    if (tmp != (type)tmp) { \
268
        SET_QC(); \
269
        dest = 0; \
270
    } else { \
271
        dest = tmp; \
272
    }} while(0)
273
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
274
NEON_VOP_ENV(qsub_u8, neon_u8, 4)
275
#undef NEON_FN
276
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
277
NEON_VOP_ENV(qsub_u16, neon_u16, 2)
278
#undef NEON_FN
279
#undef NEON_USAT
280

    
281
/* Unsigned saturating 32-bit subtract: returns a - b, or zero (after
 * SET_QC()) when b is larger than a.  */
uint32_t HELPER(neon_qsub_u32)(CPUState *env, uint32_t a, uint32_t b)
{
    uint32_t diff = a - b;

    /* A borrow makes the wrapped difference larger than the minuend.  */
    if (diff <= a) {
        return diff;
    }
    SET_QC();
    return 0;
}

/* Unsigned saturating 64-bit subtract: returns src1 - src2, or zero (after
 * SET_QC()) when src2 is larger than src1.  */
uint64_t HELPER(neon_qsub_u64)(CPUState *env, uint64_t src1, uint64_t src2)
{
    if (src1 >= src2) {
        return src1 - src2;
    }
    SET_QC();
    return 0;
}
303

    
304
#define NEON_SSAT(dest, src1, src2, type) do { \
305
    int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
306
    if (tmp != (type)tmp) { \
307
        SET_QC(); \
308
        if (src2 < 0) { \
309
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
310
        } else { \
311
            tmp = 1 << (sizeof(type) * 8 - 1); \
312
        } \
313
    } \
314
    dest = tmp; \
315
    } while(0)
316
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
317
NEON_VOP_ENV(qsub_s8, neon_s8, 4)
318
#undef NEON_FN
319
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
320
NEON_VOP_ENV(qsub_s16, neon_s16, 2)
321
#undef NEON_FN
322
#undef NEON_SSAT
323

    
324
/* Signed saturating 32-bit subtract on values passed as raw 32-bit
 * patterns.  On overflow SET_QC() is invoked and the result is 0x7fffffff
 * when `a` is non-negative, 0x80000000 when it is negative.  */
uint32_t HELPER(neon_qsub_s32)(CPUState *env, uint32_t a, uint32_t b)
{
    uint32_t diff = a - b;

    /* Overflow: the operands differ in sign and the result's sign differs
     * from that of the minuend.  */
    if (((a ^ b) & SIGNBIT) && ((diff ^ a) & SIGNBIT)) {
        SET_QC();
        return ~(((int32_t)a >> 31) ^ SIGNBIT);
    }
    return diff;
}

/* Signed saturating 64-bit subtract on values passed as raw 64-bit
 * patterns.  On overflow SET_QC() is invoked and the result is the largest
 * int64 when src1 is non-negative, the smallest when it is negative.  */
uint64_t HELPER(neon_qsub_s64)(CPUState *env, uint64_t src1, uint64_t src2)
{
    uint64_t diff = src1 - src2;

    /* Overflow: the operands differ in sign and the result's sign differs
     * from that of the minuend.  */
    if (((src1 ^ src2) & SIGNBIT64) && ((diff ^ src1) & SIGNBIT64)) {
        SET_QC();
        return ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
    }
    return diff;
}
345

    
346
#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
347
NEON_VOP(hadd_s8, neon_s8, 4)
348
NEON_VOP(hadd_u8, neon_u8, 4)
349
NEON_VOP(hadd_s16, neon_s16, 2)
350
NEON_VOP(hadd_u16, neon_u16, 2)
351
#undef NEON_FN
352

    
353
int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
354
{
355
    int32_t dest;
356

    
357
    dest = (src1 >> 1) + (src2 >> 1);
358
    if (src1 & src2 & 1)
359
        dest++;
360
    return dest;
361
}
362

    
363
uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
364
{
365
    uint32_t dest;
366

    
367
    dest = (src1 >> 1) + (src2 >> 1);
368
    if (src1 & src2 & 1)
369
        dest++;
370
    return dest;
371
}
372

    
373
#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
374
NEON_VOP(rhadd_s8, neon_s8, 4)
375
NEON_VOP(rhadd_u8, neon_u8, 4)
376
NEON_VOP(rhadd_s16, neon_s16, 2)
377
NEON_VOP(rhadd_u16, neon_u16, 2)
378
#undef NEON_FN
379

    
380
int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
381
{
382
    int32_t dest;
383

    
384
    dest = (src1 >> 1) + (src2 >> 1);
385
    if ((src1 | src2) & 1)
386
        dest++;
387
    return dest;
388
}
389

    
390
uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
391
{
392
    uint32_t dest;
393

    
394
    dest = (src1 >> 1) + (src2 >> 1);
395
    if ((src1 | src2) & 1)
396
        dest++;
397
    return dest;
398
}
399

    
400
#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
401
NEON_VOP(hsub_s8, neon_s8, 4)
402
NEON_VOP(hsub_u8, neon_u8, 4)
403
NEON_VOP(hsub_s16, neon_s16, 2)
404
NEON_VOP(hsub_u16, neon_u16, 2)
405
#undef NEON_FN
406

    
407
int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
408
{
409
    int32_t dest;
410

    
411
    dest = (src1 >> 1) - (src2 >> 1);
412
    if ((~src1) & src2 & 1)
413
        dest--;
414
    return dest;
415
}
416

    
417
uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
418
{
419
    uint32_t dest;
420

    
421
    dest = (src1 >> 1) - (src2 >> 1);
422
    if ((~src1) & src2 & 1)
423
        dest--;
424
    return dest;
425
}
426

    
427
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? ~0 : 0
428
NEON_VOP(cgt_s8, neon_s8, 4)
429
NEON_VOP(cgt_u8, neon_u8, 4)
430
NEON_VOP(cgt_s16, neon_s16, 2)
431
NEON_VOP(cgt_u16, neon_u16, 2)
432
NEON_VOP(cgt_s32, neon_s32, 1)
433
NEON_VOP(cgt_u32, neon_u32, 1)
434
#undef NEON_FN
435

    
436
#define NEON_FN(dest, src1, src2) dest = (src1 >= src2) ? ~0 : 0
437
NEON_VOP(cge_s8, neon_s8, 4)
438
NEON_VOP(cge_u8, neon_u8, 4)
439
NEON_VOP(cge_s16, neon_s16, 2)
440
NEON_VOP(cge_u16, neon_u16, 2)
441
NEON_VOP(cge_s32, neon_s32, 1)
442
NEON_VOP(cge_u32, neon_u32, 1)
443
#undef NEON_FN
444

    
445
#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
446
NEON_VOP(min_s8, neon_s8, 4)
447
NEON_VOP(min_u8, neon_u8, 4)
448
NEON_VOP(min_s16, neon_s16, 2)
449
NEON_VOP(min_u16, neon_u16, 2)
450
NEON_VOP(min_s32, neon_s32, 1)
451
NEON_VOP(min_u32, neon_u32, 1)
452
NEON_POP(pmin_s8, neon_s8, 4)
453
NEON_POP(pmin_u8, neon_u8, 4)
454
NEON_POP(pmin_s16, neon_s16, 2)
455
NEON_POP(pmin_u16, neon_u16, 2)
456
#undef NEON_FN
457

    
458
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
459
NEON_VOP(max_s8, neon_s8, 4)
460
NEON_VOP(max_u8, neon_u8, 4)
461
NEON_VOP(max_s16, neon_s16, 2)
462
NEON_VOP(max_u16, neon_u16, 2)
463
NEON_VOP(max_s32, neon_s32, 1)
464
NEON_VOP(max_u32, neon_u32, 1)
465
NEON_POP(pmax_s8, neon_s8, 4)
466
NEON_POP(pmax_u8, neon_u8, 4)
467
NEON_POP(pmax_s16, neon_s16, 2)
468
NEON_POP(pmax_u16, neon_u16, 2)
469
#undef NEON_FN
470

    
471
#define NEON_FN(dest, src1, src2) \
472
    dest = (src1 > src2) ? (src1 - src2) : (src2 - src1)
473
NEON_VOP(abd_s8, neon_s8, 4)
474
NEON_VOP(abd_u8, neon_u8, 4)
475
NEON_VOP(abd_s16, neon_s16, 2)
476
NEON_VOP(abd_u16, neon_u16, 2)
477
NEON_VOP(abd_s32, neon_s32, 1)
478
NEON_VOP(abd_u32, neon_u32, 1)
479
#undef NEON_FN
480

    
481
#define NEON_FN(dest, src1, src2) do { \
482
    int8_t tmp; \
483
    tmp = (int8_t)src2; \
484
    if (tmp >= (ssize_t)sizeof(src1) * 8 || \
485
        tmp <= -(ssize_t)sizeof(src1) * 8) { \
486
        dest = 0; \
487
    } else if (tmp < 0) { \
488
        dest = src1 >> -tmp; \
489
    } else { \
490
        dest = src1 << tmp; \
491
    }} while (0)
492
NEON_VOP(shl_u8, neon_u8, 4)
493
NEON_VOP(shl_u16, neon_u16, 2)
494
NEON_VOP(shl_u32, neon_u32, 1)
495
#undef NEON_FN
496

    
497
uint64_t HELPER(neon_shl_u64)(uint64_t val, uint64_t shiftop)
498
{
499
    int8_t shift = (int8_t)shiftop;
500
    if (shift >= 64 || shift <= -64) {
501
        val = 0;
502
    } else if (shift < 0) {
503
        val >>= -shift;
504
    } else {
505
        val <<= shift;
506
    }
507
    return val;
508
}
509

    
510
#define NEON_FN(dest, src1, src2) do { \
511
    int8_t tmp; \
512
    tmp = (int8_t)src2; \
513
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
514
        dest = 0; \
515
    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
516
        dest = src1 >> (sizeof(src1) * 8 - 1); \
517
    } else if (tmp < 0) { \
518
        dest = src1 >> -tmp; \
519
    } else { \
520
        dest = src1 << tmp; \
521
    }} while (0)
522
NEON_VOP(shl_s8, neon_s8, 4)
523
NEON_VOP(shl_s16, neon_s16, 2)
524
NEON_VOP(shl_s32, neon_s32, 1)
525
#undef NEON_FN
526

    
527
uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t shiftop)
528
{
529
    int8_t shift = (int8_t)shiftop;
530
    int64_t val = valop;
531
    if (shift >= 64) {
532
        val = 0;
533
    } else if (shift <= -64) {
534
        val >>= 63;
535
    } else if (shift < 0) {
536
        val >>= -shift;
537
    } else {
538
        val <<= shift;
539
    }
540
    return val;
541
}
542

    
543
#define NEON_FN(dest, src1, src2) do { \
544
    int8_t tmp; \
545
    tmp = (int8_t)src2; \
546
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
547
        dest = 0; \
548
    } else if (tmp < -(ssize_t)sizeof(src1) * 8) { \
549
        dest = src1 >> (sizeof(src1) * 8 - 1); \
550
    } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
551
        dest = src1 >> (tmp - 1); \
552
        dest++; \
553
        dest >>= 1; \
554
    } else if (tmp < 0) { \
555
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
556
    } else { \
557
        dest = src1 << tmp; \
558
    }} while (0)
559
NEON_VOP(rshl_s8, neon_s8, 4)
560
NEON_VOP(rshl_s16, neon_s16, 2)
561
#undef NEON_FN
562

    
563
/* The addition of the rounding constant may overflow, so we use an
564
 * intermediate 64 bits accumulator.  */
565
uint32_t HELPER(neon_rshl_s32)(uint32_t valop, uint32_t shiftop)
566
{
567
    int32_t dest;
568
    int32_t val = (int32_t)valop;
569
    int8_t shift = (int8_t)shiftop;
570
    if ((shift >= 32) || (shift <= -32)) {
571
        dest = 0;
572
    } else if (shift < 0) {
573
        int64_t big_dest = ((int64_t)val + (1 << (-1 - shift)));
574
        dest = big_dest >> -shift;
575
    } else {
576
        dest = val << shift;
577
    }
578
    return dest;
579
}
580

    
581
/* Handling addition overflow with 64 bits inputs values is more
582
 * tricky than with 32 bits values.  */
583
uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop)
584
{
585
    int8_t shift = (int8_t)shiftop;
586
    int64_t val = valop;
587
    if (shift >= 64) {
588
        val = 0;
589
    } else if (shift < -64) {
590
        val >>= 63;
591
    } else if (shift == -63) {
592
        val >>= 63;
593
        val++;
594
        val >>= 1;
595
    } else if (shift < 0) {
596
        val >>= (-shift - 1);
597
        if (val == INT64_MAX) {
598
            /* In this case, it means that the rounding constant is 1,
599
             * and the addition would overflow. Return the actual
600
             * result directly.  */
601
            val = 0x4000000000000000LL;
602
        } else {
603
            val++;
604
            val >>= 1;
605
        }
606
    } else {
607
        val <<= shift;
608
    }
609
    return val;
610
}
611

    
612
#define NEON_FN(dest, src1, src2) do { \
613
    int8_t tmp; \
614
    tmp = (int8_t)src2; \
615
    if (tmp >= (ssize_t)sizeof(src1) * 8 || \
616
        tmp < -(ssize_t)sizeof(src1) * 8) { \
617
        dest = 0; \
618
    } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
619
        dest = src1 >> (tmp - 1); \
620
    } else if (tmp < 0) { \
621
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
622
    } else { \
623
        dest = src1 << tmp; \
624
    }} while (0)
625
NEON_VOP(rshl_u8, neon_u8, 4)
626
NEON_VOP(rshl_u16, neon_u16, 2)
627
#undef NEON_FN
628

    
629
/* The addition of the rounding constant may overflow, so we use an
630
 * intermediate 64 bits accumulator.  */
631
uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shiftop)
632
{
633
    uint32_t dest;
634
    int8_t shift = (int8_t)shiftop;
635
    if (shift >= 32 || shift < -32) {
636
        dest = 0;
637
    } else if (shift == -32) {
638
        dest = val >> 31;
639
    } else if (shift < 0) {
640
        uint64_t big_dest = ((uint64_t)val + (1 << (-1 - shift)));
641
        dest = big_dest >> -shift;
642
    } else {
643
        dest = val << shift;
644
    }
645
    return dest;
646
}
647

    
648
/* Handling addition overflow with 64 bits inputs values is more
649
 * tricky than with 32 bits values.  */
650
uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop)
651
{
652
    int8_t shift = (uint8_t)shiftop;
653
    if (shift >= 64 || shift < 64) {
654
        val = 0;
655
    } else if (shift == -64) {
656
        /* Rounding a 1-bit result just preserves that bit.  */
657
        val >>= 63;
658
    } else if (shift < 0) {
659
        val >>= (-shift - 1);
660
        if (val == UINT64_MAX) {
661
            /* In this case, it means that the rounding constant is 1,
662
             * and the addition would overflow. Return the actual
663
             * result directly.  */
664
            val = 0x8000000000000000ULL;
665
        } else {
666
            val++;
667
            val >>= 1;
668
        }
669
    } else {
670
        val <<= shift;
671
    }
672
    return val;
673
}
674

    
675
#define NEON_FN(dest, src1, src2) do { \
676
    int8_t tmp; \
677
    tmp = (int8_t)src2; \
678
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
679
        if (src1) { \
680
            SET_QC(); \
681
            dest = ~0; \
682
        } else { \
683
            dest = 0; \
684
        } \
685
    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
686
        dest = 0; \
687
    } else if (tmp < 0) { \
688
        dest = src1 >> -tmp; \
689
    } else { \
690
        dest = src1 << tmp; \
691
        if ((dest >> tmp) != src1) { \
692
            SET_QC(); \
693
            dest = ~0; \
694
        } \
695
    }} while (0)
696
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
697
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
698
NEON_VOP_ENV(qshl_u32, neon_u32, 1)
699
#undef NEON_FN
700

    
701
/* Unsigned 64-bit saturating shift by a signed count (low byte of shiftop).
 * Left shifts that lose set bits saturate to all ones and invoke SET_QC();
 * right shifts of 64 or more give zero.  */
uint64_t HELPER(neon_qshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop)
{
    int8_t shift = (int8_t)shiftop;
    uint64_t shifted;

    if (shift >= 64) {
        /* Every bit is shifted out; only zero survives unsaturated.  */
        if (val == 0) {
            return val;
        }
        SET_QC();
        return ~(uint64_t)0;
    }
    if (shift <= -64) {
        return 0;
    }
    if (shift < 0) {
        return val >> -shift;
    }

    shifted = val << shift;
    if ((shifted >> shift) != val) {
        /* Shifting back did not restore the input: bits were lost.  */
        SET_QC();
        shifted = ~(uint64_t)0;
    }
    return shifted;
}
723

    
724
#define NEON_FN(dest, src1, src2) do { \
725
    int8_t tmp; \
726
    tmp = (int8_t)src2; \
727
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
728
        if (src1) { \
729
            SET_QC(); \
730
            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
731
            if (src1 > 0) { \
732
                dest--; \
733
            } \
734
        } else { \
735
            dest = src1; \
736
        } \
737
    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
738
        dest = src1 >> 31; \
739
    } else if (tmp < 0) { \
740
        dest = src1 >> -tmp; \
741
    } else { \
742
        dest = src1 << tmp; \
743
        if ((dest >> tmp) != src1) { \
744
            SET_QC(); \
745
            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
746
            if (src1 > 0) { \
747
                dest--; \
748
            } \
749
        } \
750
    }} while (0)
751
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
752
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
753
NEON_VOP_ENV(qshl_s32, neon_s32, 1)
754
#undef NEON_FN
755

    
756
/* Signed 64-bit saturating shift by a signed count (low byte of shiftop).
 * Left shifts that change the value when undone saturate to the largest
 * int64 for non-negative inputs and the smallest for negative ones, and
 * invoke SET_QC().  Right shifts of 64 or more are performed as a shift by
 * 63.  */
uint64_t HELPER(neon_qshl_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
{
    int8_t shift = (uint8_t)shiftop;
    int64_t val = valop;
    int64_t before;

    if (shift >= 64) {
        /* Every bit is shifted out; only zero survives unsaturated.  */
        if (val) {
            SET_QC();
            val = (val >> 63) ^ ~SIGNBIT64;
        }
        return val;
    }
    if (shift <= -64) {
        val >>= 63;
        return val;
    }
    if (shift < 0) {
        val >>= -shift;
        return val;
    }

    before = val;
    val <<= shift;
    if ((val >> shift) != before) {
        SET_QC();
        val = (before >> 63) ^ ~SIGNBIT64;
    }
    return val;
}
779

    
780
#define NEON_FN(dest, src1, src2) do { \
781
    if (src1 & (1 << (sizeof(src1) * 8 - 1))) { \
782
        SET_QC(); \
783
        dest = 0; \
784
    } else { \
785
        int8_t tmp; \
786
        tmp = (int8_t)src2; \
787
        if (tmp >= (ssize_t)sizeof(src1) * 8) { \
788
            if (src1) { \
789
                SET_QC(); \
790
                dest = ~0; \
791
            } else { \
792
                dest = 0; \
793
            } \
794
        } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
795
            dest = 0; \
796
        } else if (tmp < 0) { \
797
            dest = src1 >> -tmp; \
798
        } else { \
799
            dest = src1 << tmp; \
800
            if ((dest >> tmp) != src1) { \
801
                SET_QC(); \
802
                dest = ~0; \
803
            } \
804
        } \
805
    }} while (0)
806
NEON_VOP_ENV(qshlu_s8, neon_u8, 4)
807
NEON_VOP_ENV(qshlu_s16, neon_u16, 2)
808
#undef NEON_FN
809

    
810
/* 32-bit saturating shift of a signed input to an unsigned result.  A
 * negative input saturates to zero with SET_QC(); anything else is handed
 * to the unsigned saturating shift helper.  */
uint32_t HELPER(neon_qshlu_s32)(CPUState *env, uint32_t valop, uint32_t shiftop)
{
    if ((int32_t)valop >= 0) {
        return helper_neon_qshl_u32(env, valop, shiftop);
    }
    SET_QC();
    return 0;
}

/* 64-bit saturating shift of a signed input to an unsigned result.  A
 * negative input saturates to zero with SET_QC(); anything else is handed
 * to the unsigned saturating shift helper.  */
uint64_t HELPER(neon_qshlu_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
{
    if ((int64_t)valop >= 0) {
        return helper_neon_qshl_u64(env, valop, shiftop);
    }
    SET_QC();
    return 0;
}
827

    
828
/* FIXME: This is wrong.  */
829
#define NEON_FN(dest, src1, src2) do { \
830
    int8_t tmp; \
831
    tmp = (int8_t)src2; \
832
    if (tmp < 0) { \
833
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
834
    } else { \
835
        dest = src1 << tmp; \
836
        if ((dest >> tmp) != src1) { \
837
            SET_QC(); \
838
            dest = ~0; \
839
        } \
840
    }} while (0)
841
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
842
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
843
#undef NEON_FN
844

    
845
/* The addition of the rounding constant may overflow, so we use an
846
 * intermediate 64 bits accumulator.  */
847
uint32_t HELPER(neon_qrshl_u32)(CPUState *env, uint32_t val, uint32_t shiftop)
848
{
849
    uint32_t dest;
850
    int8_t shift = (int8_t)shiftop;
851
    if (shift < 0) {
852
        uint64_t big_dest = ((uint64_t)val + (1 << (-1 - shift)));
853
        dest = big_dest >> -shift;
854
    } else {
855
        dest = val << shift;
856
        if ((dest >> shift) != val) {
857
            SET_QC();
858
            dest = ~0;
859
        }
860
    }
861
    return dest;
862
}
863

    
864
/* Handling addition overflow with 64 bits inputs values is more
865
 * tricky than with 32 bits values.  */
866
uint64_t HELPER(neon_qrshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop)
867
{
868
    int8_t shift = (int8_t)shiftop;
869
    if (shift < 0) {
870
        val >>= (-shift - 1);
871
        if (val == UINT64_MAX) {
872
            /* In this case, it means that the rounding constant is 1,
873
             * and the addition would overflow. Return the actual
874
             * result directly.  */
875
            val = 0x8000000000000000ULL;
876
        } else {
877
            val++;
878
            val >>= 1;
879
        }
880
    } else { \
881
        uint64_t tmp = val;
882
        val <<= shift;
883
        if ((val >> shift) != tmp) {
884
            SET_QC();
885
            val = ~0;
886
        }
887
    }
888
    return val;
889
}
890

    
891
#define NEON_FN(dest, src1, src2) do { \
892
    int8_t tmp; \
893
    tmp = (int8_t)src2; \
894
    if (tmp < 0) { \
895
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
896
    } else { \
897
        dest = src1 << tmp; \
898
        if ((dest >> tmp) != src1) { \
899
            SET_QC(); \
900
            dest = src1 >> 31; \
901
        } \
902
    }} while (0)
903
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
904
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
905
#undef NEON_FN
906

    
907
/* The addition of the rounding constant may overflow, so we use an
908
 * intermediate 64 bits accumulator.  */
909
uint32_t HELPER(neon_qrshl_s32)(CPUState *env, uint32_t valop, uint32_t shiftop)
910
{
911
    int32_t dest;
912
    int32_t val = (int32_t)valop;
913
    int8_t shift = (int8_t)shiftop;
914
    if (shift < 0) {
915
        int64_t big_dest = ((int64_t)val + (1 << (-1 - shift)));
916
        dest = big_dest >> -shift;
917
    } else {
918
        dest = val << shift;
919
        if ((dest >> shift) != val) {
920
            SET_QC();
921
            dest = (val >> 31) ^ ~SIGNBIT;
922
        }
923
    }
924
    return dest;
925
}
926

    
927
/* Handling addition overflow with 64 bits inputs values is more
928
 * tricky than with 32 bits values.  */
929
uint64_t HELPER(neon_qrshl_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
930
{
931
    int8_t shift = (uint8_t)shiftop;
932
    int64_t val = valop;
933

    
934
    if (shift < 0) {
935
        val >>= (-shift - 1);
936
        if (val == INT64_MAX) {
937
            /* In this case, it means that the rounding constant is 1,
938
             * and the addition would overflow. Return the actual
939
             * result directly.  */
940
            val = 0x4000000000000000ULL;
941
        } else {
942
            val++;
943
            val >>= 1;
944
        }
945
    } else {
946
        int64_t tmp = val;
947
        val <<= shift;
948
        if ((val >> shift) != tmp) {
949
            SET_QC();
950
            val = (tmp >> 63) ^ ~SIGNBIT64;
951
        }
952
    }
953
    return val;
954
}
955

    
956
uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
957
{
958
    uint32_t mask;
959
    mask = (a ^ b) & 0x80808080u;
960
    a &= ~0x80808080u;
961
    b &= ~0x80808080u;
962
    return (a + b) ^ mask;
963
}
964

    
965
uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
966
{
967
    uint32_t mask;
968
    mask = (a ^ b) & 0x80008000u;
969
    a &= ~0x80008000u;
970
    b &= ~0x80008000u;
971
    return (a + b) ^ mask;
972
}
973

    
974
#define NEON_FN(dest, src1, src2) dest = src1 + src2
975
NEON_POP(padd_u8, neon_u8, 4)
976
NEON_POP(padd_u16, neon_u16, 2)
977
#undef NEON_FN
978

    
979
#define NEON_FN(dest, src1, src2) dest = src1 - src2
980
NEON_VOP(sub_u8, neon_u8, 4)
981
NEON_VOP(sub_u16, neon_u16, 2)
982
#undef NEON_FN
983

    
984
#define NEON_FN(dest, src1, src2) dest = src1 * src2
985
NEON_VOP(mul_u8, neon_u8, 4)
986
NEON_VOP(mul_u16, neon_u16, 2)
987
#undef NEON_FN
988

    
989
/* Polynomial multiplication is like integer multiplication except the
990
   partial products are XORed, not added.  */
991
uint32_t HELPER(neon_mul_p8)(uint32_t op1, uint32_t op2)
992
{
993
    uint32_t mask;
994
    uint32_t result;
995
    result = 0;
996
    while (op1) {
997
        mask = 0;
998
        if (op1 & 1)
999
            mask |= 0xff;
1000
        if (op1 & (1 << 8))
1001
            mask |= (0xff << 8);
1002
        if (op1 & (1 << 16))
1003
            mask |= (0xff << 16);
1004
        if (op1 & (1 << 24))
1005
            mask |= (0xff << 24);
1006
        result ^= op2 & mask;
1007
        op1 = (op1 >> 1) & 0x7f7f7f7f;
1008
        op2 = (op2 << 1) & 0xfefefefe;
1009
    }
1010
    return result;
1011
}
1012

    
1013
uint64_t HELPER(neon_mull_p8)(uint32_t op1, uint32_t op2)
1014
{
1015
    uint64_t result = 0;
1016
    uint64_t mask;
1017
    uint64_t op2ex = op2;
1018
    op2ex = (op2ex & 0xff) |
1019
        ((op2ex & 0xff00) << 8) |
1020
        ((op2ex & 0xff0000) << 16) |
1021
        ((op2ex & 0xff000000) << 24);
1022
    while (op1) {
1023
        mask = 0;
1024
        if (op1 & 1) {
1025
            mask |= 0xffff;
1026
        }
1027
        if (op1 & (1 << 8)) {
1028
            mask |= (0xffffU << 16);
1029
        }
1030
        if (op1 & (1 << 16)) {
1031
            mask |= (0xffffULL << 32);
1032
        }
1033
        if (op1 & (1 << 24)) {
1034
            mask |= (0xffffULL << 48);
1035
        }
1036
        result ^= op2ex & mask;
1037
        op1 = (op1 >> 1) & 0x7f7f7f7f;
1038
        op2ex <<= 1;
1039
    }
1040
    return result;
1041
}
1042

    
1043
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
1044
NEON_VOP(tst_u8, neon_u8, 4)
1045
NEON_VOP(tst_u16, neon_u16, 2)
1046
NEON_VOP(tst_u32, neon_u32, 1)
1047
#undef NEON_FN
1048

    
1049
#define NEON_FN(dest, src1, src2) dest = (src1 == src2) ? -1 : 0
1050
NEON_VOP(ceq_u8, neon_u8, 4)
1051
NEON_VOP(ceq_u16, neon_u16, 2)
1052
NEON_VOP(ceq_u32, neon_u32, 1)
1053
#undef NEON_FN
1054

    
1055
#define NEON_FN(dest, src, dummy) dest = (src < 0) ? -src : src
1056
NEON_VOP1(abs_s8, neon_s8, 4)
1057
NEON_VOP1(abs_s16, neon_s16, 2)
1058
#undef NEON_FN
1059

    
1060
/* Count Leading Sign/Zero Bits.  */

/* Number of leading zero bits in the 8-bit value X; 8 when X is zero.  */
static inline int do_clz8(uint8_t x)
{
    int zeros = 8;

    /* Each shift that is still needed is one significant bit.  */
    while (x != 0) {
        x >>= 1;
        zeros--;
    }
    return zeros;
}
1068

    
1069
/* Number of leading zero bits in the 16-bit value X; 16 when X is zero.  */
static inline int do_clz16(uint16_t x)
{
    int zeros = 16;

    /* Each shift that is still needed is one significant bit.  */
    while (x != 0) {
        x >>= 1;
        zeros--;
    }
    return zeros;
}
1076

    
1077
#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
1078
NEON_VOP1(clz_u8, neon_u8, 4)
1079
#undef NEON_FN
1080

    
1081
#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
1082
NEON_VOP1(clz_u16, neon_u16, 2)
1083
#undef NEON_FN
1084

    
1085
#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
1086
NEON_VOP1(cls_s8, neon_s8, 4)
1087
#undef NEON_FN
1088

    
1089
#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
1090
NEON_VOP1(cls_s16, neon_s16, 2)
1091
#undef NEON_FN
1092

    
1093
uint32_t HELPER(neon_cls_s32)(uint32_t x)
1094
{
1095
    int count;
1096
    if ((int32_t)x < 0)
1097
        x = ~x;
1098
    for (count = 32; x; count--)
1099
        x = x >> 1;
1100
    return count - 1;
1101
}
1102

    
1103
/* Bit count.  */
1104
uint32_t HELPER(neon_cnt_u8)(uint32_t x)
1105
{
1106
    x = (x & 0x55555555) + ((x >>  1) & 0x55555555);
1107
    x = (x & 0x33333333) + ((x >>  2) & 0x33333333);
1108
    x = (x & 0x0f0f0f0f) + ((x >>  4) & 0x0f0f0f0f);
1109
    return x;
1110
}
1111

    
1112
#define NEON_QDMULH16(dest, src1, src2, round) do { \
1113
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
1114
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
1115
        SET_QC(); \
1116
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
1117
    } else { \
1118
        tmp <<= 1; \
1119
    } \
1120
    if (round) { \
1121
        int32_t old = tmp; \
1122
        tmp += 1 << 15; \
1123
        if ((int32_t)tmp < old) { \
1124
            SET_QC(); \
1125
            tmp = SIGNBIT - 1; \
1126
        } \
1127
    } \
1128
    dest = tmp >> 16; \
1129
    } while(0)
1130
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
1131
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
1132
#undef NEON_FN
1133
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
1134
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
1135
#undef NEON_FN
1136
#undef NEON_QDMULH16
1137

    
1138
#define NEON_QDMULH32(dest, src1, src2, round) do { \
1139
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
1140
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
1141
        SET_QC(); \
1142
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
1143
    } else { \
1144
        tmp <<= 1; \
1145
    } \
1146
    if (round) { \
1147
        int64_t old = tmp; \
1148
        tmp += (int64_t)1 << 31; \
1149
        if ((int64_t)tmp < old) { \
1150
            SET_QC(); \
1151
            tmp = SIGNBIT64 - 1; \
1152
        } \
1153
    } \
1154
    dest = tmp >> 32; \
1155
    } while(0)
1156
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
1157
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
1158
#undef NEON_FN
1159
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
1160
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
1161
#undef NEON_FN
1162
#undef NEON_QDMULH32
1163

    
1164
uint32_t HELPER(neon_narrow_u8)(uint64_t x)
1165
{
1166
    return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
1167
           | ((x >> 24) & 0xff000000u);
1168
}
1169

    
1170
uint32_t HELPER(neon_narrow_u16)(uint64_t x)
1171
{
1172
    return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
1173
}
1174

    
1175
uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
1176
{
1177
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
1178
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
1179
}
1180

    
1181
uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
1182
{
1183
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
1184
}
1185

    
1186
uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
1187
{
1188
    x &= 0xff80ff80ff80ff80ull;
1189
    x += 0x0080008000800080ull;
1190
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
1191
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
1192
}
1193

    
1194
uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
1195
{
1196
    x &= 0xffff8000ffff8000ull;
1197
    x += 0x0000800000008000ull;
1198
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
1199
}
1200

    
1201
/* Narrow four signed 16-bit lanes to unsigned 8 bits with saturation.
 * Negative lanes become 0 and lanes above 0xff become 0xff; either case
 * invokes SET_QC().
 */
uint32_t HELPER(neon_unarrow_sat8)(CPUState *env, uint64_t x)
{
    uint32_t packed = 0;
    int shift;

    for (shift = 0; shift < 64; shift += 16) {
        uint16_t lane = x >> shift;
        uint8_t narrow;

        if (lane & 0x8000) {
            /* Negative input: the output byte stays zero.  */
            SET_QC();
            continue;
        }
        if (lane > 0xff) {
            SET_QC();
            narrow = 0xff;
        } else {
            narrow = lane;
        }
        packed |= (uint32_t)narrow << (shift / 2);
    }
    return packed;
}
1227

    
1228
/* Narrow four unsigned 16-bit lanes to 8 bits with saturation: lanes
   above 0xff become 0xff and invoke SET_QC().  */
uint32_t HELPER(neon_narrow_sat_u8)(CPUState *env, uint64_t x)
{
    uint32_t packed = 0;
    int shift;

    for (shift = 0; shift < 64; shift += 16) {
        uint16_t lane = x >> shift;
        uint8_t narrow;

        if (lane > 0xff) {
            SET_QC();
            narrow = 0xff;
        } else {
            narrow = lane;
        }
        packed |= (uint32_t)narrow << (shift / 2);
    }
    return packed;
}
1250

    
1251
/* Narrow four signed 16-bit lanes to signed 8 bits with saturation:
   out-of-range lanes become 0x7f or 0x80 (by sign) and invoke SET_QC().  */
uint32_t HELPER(neon_narrow_sat_s8)(CPUState *env, uint64_t x)
{
    uint32_t packed = 0;
    int shift;

    for (shift = 0; shift < 64; shift += 16) {
        int16_t lane = x >> shift;
        uint8_t narrow;

        if (lane == (int8_t)lane) {
            narrow = lane;
        } else {
            /* (lane >> 15) is 0 or -1, giving 0x7f or 0x80 after XOR.  */
            SET_QC();
            narrow = (lane >> 15) ^ 0x7f;
        }
        packed |= (uint32_t)narrow << (shift / 2);
    }
    return packed;
}
1273

    
1274
/* Narrow two signed 32-bit lanes to unsigned 16 bits with saturation.
 * Negative lanes become 0 and lanes above 0xffff become 0xffff; either
 * case invokes SET_QC().
 */
uint32_t HELPER(neon_unarrow_sat16)(CPUState *env, uint64_t x)
{
    uint32_t lane[2];
    int i;

    lane[0] = x;
    lane[1] = x >> 32;
    for (i = 0; i < 2; i++) {
        if (lane[i] & 0x80000000) {
            SET_QC();
            lane[i] = 0;
        } else if (lane[i] > 0xffff) {
            SET_QC();
            lane[i] = 0xffff;
        }
    }
    return lane[0] | (lane[1] << 16);
}
1296

    
1297
/* Narrow two unsigned 32-bit lanes to 16 bits with saturation: lanes
   above 0xffff become 0xffff and invoke SET_QC().  */
uint32_t HELPER(neon_narrow_sat_u16)(CPUState *env, uint64_t x)
{
    uint32_t lane[2];
    int i;

    lane[0] = x;
    lane[1] = x >> 32;
    for (i = 0; i < 2; i++) {
        if (lane[i] > 0xffff) {
            SET_QC();
            lane[i] = 0xffff;
        }
    }
    return lane[0] | (lane[1] << 16);
}
1313

    
1314
/* Narrow two signed 32-bit lanes to signed 16 bits with saturation.
 * Out-of-range lanes become 0x7fff or 0x8000 (by sign) and invoke
 * SET_QC().  The two results are returned packed, low lane in bits 0-15.
 */
uint32_t HELPER(neon_narrow_sat_s16)(CPUState *env, uint64_t x)
{
    int32_t low;
    int32_t high;
    low = x;
    if (low != (int16_t)low) {
        /* (low >> 31) is 0 or -1, giving 0x7fff or ...8000 after XOR.  */
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    /* Shift as unsigned: left-shifting a negative int is undefined.  */
    return (uint16_t)low | ((uint32_t)high << 16);
}
1330

    
1331
/* Narrow a signed 64-bit value to unsigned 32 bits with saturation:
   negative inputs give 0, inputs above 0xffffffff give 0xffffffff, and
   both cases invoke SET_QC().  */
uint32_t HELPER(neon_unarrow_sat32)(CPUState *env, uint64_t x)
{
    uint32_t clamped;

    if (x & 0x8000000000000000ull) {
        clamped = 0;
    } else if (x > 0xffffffffu) {
        clamped = 0xffffffffu;
    } else {
        /* In range: no saturation to record.  */
        return x;
    }
    SET_QC();
    return clamped;
}
1343

    
1344
/* Narrow an unsigned 64-bit value to 32 bits with saturation: inputs
   above 0xffffffff give 0xffffffff and invoke SET_QC().  */
uint32_t HELPER(neon_narrow_sat_u32)(CPUState *env, uint64_t x)
{
    if (x <= 0xffffffffu) {
        return x;
    }
    SET_QC();
    return 0xffffffffu;
}
1352

    
1353
/* Narrow a signed 64-bit value to signed 32 bits with saturation:
   out-of-range inputs give 0x7fffffff or 0x80000000 (by sign) and invoke
   SET_QC().  */
uint32_t HELPER(neon_narrow_sat_s32)(CPUState *env, uint64_t x)
{
    int64_t wide = (int64_t)x;

    if (wide == (int32_t)x) {
        return x;
    }
    SET_QC();
    /* (wide >> 63) is 0 or -1, selecting the positive or negative limit.  */
    return (wide >> 63) ^ 0x7fffffff;
}
1361

    
1362
uint64_t HELPER(neon_widen_u8)(uint32_t x)
1363
{
1364
    uint64_t tmp;
1365
    uint64_t ret;
1366
    ret = (uint8_t)x;
1367
    tmp = (uint8_t)(x >> 8);
1368
    ret |= tmp << 16;
1369
    tmp = (uint8_t)(x >> 16);
1370
    ret |= tmp << 32;
1371
    tmp = (uint8_t)(x >> 24);
1372
    ret |= tmp << 48;
1373
    return ret;
1374
}
1375

    
1376
uint64_t HELPER(neon_widen_s8)(uint32_t x)
1377
{
1378
    uint64_t tmp;
1379
    uint64_t ret;
1380
    ret = (uint16_t)(int8_t)x;
1381
    tmp = (uint16_t)(int8_t)(x >> 8);
1382
    ret |= tmp << 16;
1383
    tmp = (uint16_t)(int8_t)(x >> 16);
1384
    ret |= tmp << 32;
1385
    tmp = (uint16_t)(int8_t)(x >> 24);
1386
    ret |= tmp << 48;
1387
    return ret;
1388
}
1389

    
1390
uint64_t HELPER(neon_widen_u16)(uint32_t x)
1391
{
1392
    uint64_t high = (uint16_t)(x >> 16);
1393
    return ((uint16_t)x) | (high << 32);
1394
}
1395

    
1396
uint64_t HELPER(neon_widen_s16)(uint32_t x)
1397
{
1398
    uint64_t high = (int16_t)(x >> 16);
1399
    return ((uint32_t)(int16_t)x) | (high << 32);
1400
}
1401

    
1402
uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
1403
{
1404
    uint64_t mask;
1405
    mask = (a ^ b) & 0x8000800080008000ull;
1406
    a &= ~0x8000800080008000ull;
1407
    b &= ~0x8000800080008000ull;
1408
    return (a + b) ^ mask;
1409
}
1410

    
1411
uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
1412
{
1413
    uint64_t mask;
1414
    mask = (a ^ b) & 0x8000000080000000ull;
1415
    a &= ~0x8000000080000000ull;
1416
    b &= ~0x8000000080000000ull;
1417
    return (a + b) ^ mask;
1418
}
1419

    
1420
uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
1421
{
1422
    uint64_t tmp;
1423
    uint64_t tmp2;
1424

    
1425
    tmp = a & 0x0000ffff0000ffffull;
1426
    tmp += (a >> 16) & 0x0000ffff0000ffffull;
1427
    tmp2 = b & 0xffff0000ffff0000ull;
1428
    tmp2 += (b << 16) & 0xffff0000ffff0000ull;
1429
    return    ( tmp         & 0xffff)
1430
            | ((tmp  >> 16) & 0xffff0000ull)
1431
            | ((tmp2 << 16) & 0xffff00000000ull)
1432
            | ( tmp2        & 0xffff000000000000ull);
1433
}
1434

    
1435
uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
1436
{
1437
    uint32_t low = a + (a >> 32);
1438
    uint32_t high = b + (b >> 32);
1439
    return low + ((uint64_t)high << 32);
1440
}
1441

    
1442
uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
1443
{
1444
    uint64_t mask;
1445
    mask = (a ^ ~b) & 0x8000800080008000ull;
1446
    a |= 0x8000800080008000ull;
1447
    b &= ~0x8000800080008000ull;
1448
    return (a - b) ^ mask;
1449
}
1450

    
1451
uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
1452
{
1453
    uint64_t mask;
1454
    mask = (a ^ ~b) & 0x8000000080000000ull;
1455
    a |= 0x8000000080000000ull;
1456
    b &= ~0x8000000080000000ull;
1457
    return (a - b) ^ mask;
1458
}
1459

    
1460
/* Signed saturating add of two packed 32-bit lanes.  A lane overflows
   when both operands share a sign that the sum does not; it is then
   clamped towards the sign of the first operand and SET_QC() is invoked.  */
uint64_t HELPER(neon_addl_saturate_s32)(CPUState *env, uint64_t a, uint64_t b)
{
    uint64_t packed = 0;
    int shift;

    for (shift = 0; shift < 64; shift += 32) {
        uint32_t x = a >> shift;
        uint32_t y = b >> shift;
        uint32_t sum = x + y;

        if (!((x ^ y) & SIGNBIT) && ((sum ^ x) & SIGNBIT)) {
            SET_QC();
            sum = ((int32_t)x >> 31) ^ ~SIGNBIT;
        }
        packed |= (uint64_t)sum << shift;
    }
    return packed;
}
1481

    
1482
/* Signed saturating 64-bit add.  Overflow occurs when both operands
   share a sign that the sum does not; the result is then clamped towards
   the sign of a and SET_QC() is invoked.  */
uint64_t HELPER(neon_addl_saturate_s64)(CPUState *env, uint64_t a, uint64_t b)
{
    uint64_t sum = a + b;

    if (((a ^ b) & SIGNBIT64) || !((sum ^ a) & SIGNBIT64)) {
        /* Mixed signs, or sign preserved: no overflow.  */
        return sum;
    }
    SET_QC();
    return ((int64_t)a >> 63) ^ ~SIGNBIT64;
}
1493

    
1494
/* Absolute difference: DEST = |X - Y| after both inputs are converted to
   TYPE.  The subtraction is done in TYPE (after integer promotion), so
   TYPE must be able to hold the difference.  */
#define DO_ABD(dest, x, y, type) do { \
    type lhs = x; \
    type rhs = y; \
    if (lhs > rhs) { \
        dest = lhs - rhs; \
    } else { \
        dest = rhs - lhs; \
    } \
    } while(0)
1499

    
1500
uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
1501
{
1502
    uint64_t tmp;
1503
    uint64_t result;
1504
    DO_ABD(result, a, b, uint8_t);
1505
    DO_ABD(tmp, a >> 8, b >> 8, uint8_t);
1506
    result |= tmp << 16;
1507
    DO_ABD(tmp, a >> 16, b >> 16, uint8_t);
1508
    result |= tmp << 32;
1509
    DO_ABD(tmp, a >> 24, b >> 24, uint8_t);
1510
    result |= tmp << 48;
1511
    return result;
1512
}
1513

    
1514
uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
1515
{
1516
    uint64_t tmp;
1517
    uint64_t result;
1518
    DO_ABD(result, a, b, int8_t);
1519
    DO_ABD(tmp, a >> 8, b >> 8, int8_t);
1520
    result |= tmp << 16;
1521
    DO_ABD(tmp, a >> 16, b >> 16, int8_t);
1522
    result |= tmp << 32;
1523
    DO_ABD(tmp, a >> 24, b >> 24, int8_t);
1524
    result |= tmp << 48;
1525
    return result;
1526
}
1527

    
1528
uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
1529
{
1530
    uint64_t tmp;
1531
    uint64_t result;
1532
    DO_ABD(result, a, b, uint16_t);
1533
    DO_ABD(tmp, a >> 16, b >> 16, uint16_t);
1534
    return result | (tmp << 32);
1535
}
1536

    
1537
uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
1538
{
1539
    uint64_t tmp;
1540
    uint64_t result;
1541
    DO_ABD(result, a, b, int16_t);
1542
    DO_ABD(tmp, a >> 16, b >> 16, int16_t);
1543
    return result | (tmp << 32);
1544
}
1545

    
1546
uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
1547
{
1548
    uint64_t result;
1549
    DO_ABD(result, a, b, uint32_t);
1550
    return result;
1551
}
1552

    
1553
uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
1554
{
1555
    uint64_t result;
1556
    DO_ABD(result, a, b, int32_t);
1557
    return result;
1558
}
1559
#undef DO_ABD
1560

    
1561
/* Widening multiply. Named type is the source type.  */
1562
/* DEST = X * Y, where the inputs are first converted to the source lane
 * type TYPE1 and the product is truncated to the result lane type TYPE2.
 *
 * The multiplication is performed in uint32_t.  When TYPE2 is uint16_t
 * the operands would otherwise be promoted to signed int, and a product
 * such as 0xff80 * 0xff80 (the signed 8-bit case) overflows int.  The
 * low bits of the product are the same either way.  TYPE2 must be no
 * wider than 32 bits.
 */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((uint32_t)(type2)tmp_x * (uint32_t)(type2)tmp_y); \
    } while(0)
1567

    
1568
uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
1569
{
1570
    uint64_t tmp;
1571
    uint64_t result;
1572

    
1573
    DO_MULL(result, a, b, uint8_t, uint16_t);
1574
    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
1575
    result |= tmp << 16;
1576
    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
1577
    result |= tmp << 32;
1578
    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
1579
    result |= tmp << 48;
1580
    return result;
1581
}
1582

    
1583
uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
1584
{
1585
    uint64_t tmp;
1586
    uint64_t result;
1587

    
1588
    DO_MULL(result, a, b, int8_t, uint16_t);
1589
    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
1590
    result |= tmp << 16;
1591
    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
1592
    result |= tmp << 32;
1593
    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
1594
    result |= tmp << 48;
1595
    return result;
1596
}
1597

    
1598
uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
1599
{
1600
    uint64_t tmp;
1601
    uint64_t result;
1602

    
1603
    DO_MULL(result, a, b, uint16_t, uint32_t);
1604
    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
1605
    return result | (tmp << 32);
1606
}
1607

    
1608
uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
1609
{
1610
    uint64_t tmp;
1611
    uint64_t result;
1612

    
1613
    DO_MULL(result, a, b, int16_t, uint32_t);
1614
    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
1615
    return result | (tmp << 32);
1616
}
1617

    
1618
uint64_t HELPER(neon_negl_u16)(uint64_t x)
1619
{
1620
    uint16_t tmp;
1621
    uint64_t result;
1622
    result = (uint16_t)-x;
1623
    tmp = -(x >> 16);
1624
    result |= (uint64_t)tmp << 16;
1625
    tmp = -(x >> 32);
1626
    result |= (uint64_t)tmp << 32;
1627
    tmp = -(x >> 48);
1628
    result |= (uint64_t)tmp << 48;
1629
    return result;
1630
}
1631

    
1632
uint64_t HELPER(neon_negl_u32)(uint64_t x)
1633
{
1634
    uint32_t low = -x;
1635
    uint32_t high = -(x >> 32);
1636
    return low | ((uint64_t)high << 32);
1637
}
1638

    
1639
/* FIXME:  There should be a native op for this.  */
1640
uint64_t HELPER(neon_negl_u64)(uint64_t x)
1641
{
1642
    return -x;
1643
}
1644

    
1645
/* Saturating sign manipulation.  */
1646
/* ??? Make these use NEON_VOP1 */
1647
#define DO_QABS8(x) do { \
1648
    if (x == (int8_t)0x80) { \
1649
        x = 0x7f; \
1650
        SET_QC(); \
1651
    } else if (x < 0) { \
1652
        x = -x; \
1653
    }} while (0)
1654
uint32_t HELPER(neon_qabs_s8)(CPUState *env, uint32_t x)
1655
{
1656
    neon_s8 vec;
1657
    NEON_UNPACK(neon_s8, vec, x);
1658
    DO_QABS8(vec.v1);
1659
    DO_QABS8(vec.v2);
1660
    DO_QABS8(vec.v3);
1661
    DO_QABS8(vec.v4);
1662
    NEON_PACK(neon_s8, x, vec);
1663
    return x;
1664
}
1665
#undef DO_QABS8
1666

    
1667
#define DO_QNEG8(x) do { \
1668
    if (x == (int8_t)0x80) { \
1669
        x = 0x7f; \
1670
        SET_QC(); \
1671
    } else { \
1672
        x = -x; \
1673
    }} while (0)
1674
uint32_t HELPER(neon_qneg_s8)(CPUState *env, uint32_t x)
1675
{
1676
    neon_s8 vec;
1677
    NEON_UNPACK(neon_s8, vec, x);
1678
    DO_QNEG8(vec.v1);
1679
    DO_QNEG8(vec.v2);
1680
    DO_QNEG8(vec.v3);
1681
    DO_QNEG8(vec.v4);
1682
    NEON_PACK(neon_s8, x, vec);
1683
    return x;
1684
}
1685
#undef DO_QNEG8
1686

    
1687
#define DO_QABS16(x) do { \
1688
    if (x == (int16_t)0x8000) { \
1689
        x = 0x7fff; \
1690
        SET_QC(); \
1691
    } else if (x < 0) { \
1692
        x = -x; \
1693
    }} while (0)
1694
uint32_t HELPER(neon_qabs_s16)(CPUState *env, uint32_t x)
1695
{
1696
    neon_s16 vec;
1697
    NEON_UNPACK(neon_s16, vec, x);
1698
    DO_QABS16(vec.v1);
1699
    DO_QABS16(vec.v2);
1700
    NEON_PACK(neon_s16, x, vec);
1701
    return x;
1702
}
1703
#undef DO_QABS16
1704

    
1705
#define DO_QNEG16(x) do { \
1706
    if (x == (int16_t)0x8000) { \
1707
        x = 0x7fff; \
1708
        SET_QC(); \
1709
    } else { \
1710
        x = -x; \
1711
    }} while (0)
1712
uint32_t HELPER(neon_qneg_s16)(CPUState *env, uint32_t x)
1713
{
1714
    neon_s16 vec;
1715
    NEON_UNPACK(neon_s16, vec, x);
1716
    DO_QNEG16(vec.v1);
1717
    DO_QNEG16(vec.v2);
1718
    NEON_PACK(neon_s16, x, vec);
1719
    return x;
1720
}
1721
#undef DO_QNEG16
1722

    
1723
/* Saturating absolute value of a signed 32-bit value.  The most
   negative value becomes 0x7fffffff and SET_QC() is invoked.  */
uint32_t HELPER(neon_qabs_s32)(CPUState *env, uint32_t x)
{
    if ((int32_t)x >= 0) {
        return x;
    }
    if (x == SIGNBIT) {
        SET_QC();
        return ~SIGNBIT;
    }
    return -x;
}
1733

    
1734
/* Saturating negation of a signed 32-bit value.  The most negative
   value becomes 0x7fffffff and SET_QC() is invoked.  */
uint32_t HELPER(neon_qneg_s32)(CPUState *env, uint32_t x)
{
    if (x != SIGNBIT) {
        return -x;
    }
    SET_QC();
    return ~SIGNBIT;
}
1744

    
1745
/* NEON Float helpers.  */
1746
uint32_t HELPER(neon_min_f32)(uint32_t a, uint32_t b)
1747
{
1748
    float32 f0 = vfp_itos(a);
1749
    float32 f1 = vfp_itos(b);
1750
    return (float32_compare_quiet(f0, f1, NFS) == -1) ? a : b;
1751
}
1752

    
1753
uint32_t HELPER(neon_max_f32)(uint32_t a, uint32_t b)
1754
{
1755
    float32 f0 = vfp_itos(a);
1756
    float32 f1 = vfp_itos(b);
1757
    return (float32_compare_quiet(f0, f1, NFS) == 1) ? a : b;
1758
}
1759

    
1760
uint32_t HELPER(neon_abd_f32)(uint32_t a, uint32_t b)
1761
{
1762
    float32 f0 = vfp_itos(a);
1763
    float32 f1 = vfp_itos(b);
1764
    return vfp_stoi((float32_compare_quiet(f0, f1, NFS) == 1)
1765
                    ? float32_sub(f0, f1, NFS)
1766
                    : float32_sub(f1, f0, NFS));
1767
}
1768

    
1769
uint32_t HELPER(neon_add_f32)(uint32_t a, uint32_t b)
1770
{
1771
    return vfp_stoi(float32_add(vfp_itos(a), vfp_itos(b), NFS));
1772
}
1773

    
1774
uint32_t HELPER(neon_sub_f32)(uint32_t a, uint32_t b)
1775
{
1776
    return vfp_stoi(float32_sub(vfp_itos(a), vfp_itos(b), NFS));
1777
}
1778

    
1779
uint32_t HELPER(neon_mul_f32)(uint32_t a, uint32_t b)
1780
{
1781
    return vfp_stoi(float32_mul(vfp_itos(a), vfp_itos(b), NFS));
1782
}
1783

    
1784
/* Floating point comparisons produce an integer result.  */
1785
#define NEON_VOP_FCMP(name, cmp) \
1786
uint32_t HELPER(neon_##name)(uint32_t a, uint32_t b) \
1787
{ \
1788
    if (float32_compare_quiet(vfp_itos(a), vfp_itos(b), NFS) cmp 0) \
1789
        return ~0; \
1790
    else \
1791
        return 0; \
1792
}
1793

    
1794
NEON_VOP_FCMP(ceq_f32, ==)
1795
NEON_VOP_FCMP(cge_f32, >=)
1796
NEON_VOP_FCMP(cgt_f32, >)
1797

    
1798
uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b)
1799
{
1800
    float32 f0 = float32_abs(vfp_itos(a));
1801
    float32 f1 = float32_abs(vfp_itos(b));
1802
    return (float32_compare_quiet(f0, f1,NFS) >= 0) ? ~0 : 0;
1803
}
1804

    
1805
uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b)
1806
{
1807
    float32 f0 = float32_abs(vfp_itos(a));
1808
    float32 f1 = float32_abs(vfp_itos(b));
1809
    return (float32_compare_quiet(f0, f1, NFS) > 0) ? ~0 : 0;
1810
}
1811

    
1812
/* Extract element N (counting from the least significant end) of width
   SIZE bits from the packed value V.  The mask is built from a 64-bit
   constant, so SIZE must be less than 64.  */
#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))
1813

    
1814
/* De-interleave the 8-bit elements of two register pairs.
 * Taking the sequence regs[rd], regs[rd + 1], regs[rm], regs[rm + 1],
 * the even-numbered bytes are gathered into regs[rd], regs[rd + 1] and
 * the odd-numbered bytes into regs[rm], regs[rm + 1].
 */
void HELPER(neon_qunzip8)(CPUState *env, uint32_t rd, uint32_t rm)
{
    uint64_t src[4];
    uint64_t even[2] = { 0, 0 };
    uint64_t odd[2] = { 0, 0 };
    int i;

    src[0] = float64_val(env->vfp.regs[rd]);
    src[1] = float64_val(env->vfp.regs[rd + 1]);
    src[2] = float64_val(env->vfp.regs[rm]);
    src[3] = float64_val(env->vfp.regs[rm + 1]);

    /* Output byte i comes from source i / 4, element pair i % 4.  */
    for (i = 0; i < 16; i++) {
        uint64_t reg = src[i / 4];
        int pos = (i % 4) * 2;
        int shift = (i % 8) * 8;

        even[i / 8] |= ELEM(reg, pos, 8) << shift;
        odd[i / 8] |= ELEM(reg, pos + 1, 8) << shift;
    }
    env->vfp.regs[rm] = make_float64(odd[0]);
    env->vfp.regs[rm + 1] = make_float64(odd[1]);
    env->vfp.regs[rd] = make_float64(even[0]);
    env->vfp.regs[rd + 1] = make_float64(even[1]);
}
1841

    
1842
/* De-interleave the 16-bit elements of two register pairs.
 * Taking the sequence regs[rd], regs[rd + 1], regs[rm], regs[rm + 1],
 * the even-numbered elements are gathered into regs[rd], regs[rd + 1]
 * and the odd-numbered elements into regs[rm], regs[rm + 1].
 */
void HELPER(neon_qunzip16)(CPUState *env, uint32_t rd, uint32_t rm)
{
    uint64_t src[4];
    uint64_t even[2] = { 0, 0 };
    uint64_t odd[2] = { 0, 0 };
    int i;

    src[0] = float64_val(env->vfp.regs[rd]);
    src[1] = float64_val(env->vfp.regs[rd + 1]);
    src[2] = float64_val(env->vfp.regs[rm]);
    src[3] = float64_val(env->vfp.regs[rm + 1]);

    /* Output element i comes from source i / 2, element pair i % 2.  */
    for (i = 0; i < 8; i++) {
        uint64_t reg = src[i / 2];
        int pos = (i % 2) * 2;
        int shift = (i % 4) * 16;

        even[i / 4] |= ELEM(reg, pos, 16) << shift;
        odd[i / 4] |= ELEM(reg, pos + 1, 16) << shift;
    }
    env->vfp.regs[rm] = make_float64(odd[0]);
    env->vfp.regs[rm + 1] = make_float64(odd[1]);
    env->vfp.regs[rd] = make_float64(even[0]);
    env->vfp.regs[rd + 1] = make_float64(even[1]);
}
1861

    
1862
/* De-interleave the 32-bit elements of two register pairs.
 * Taking the sequence regs[rd], regs[rd + 1], regs[rm], regs[rm + 1],
 * the low words are gathered into regs[rd], regs[rd + 1] and the high
 * words into regs[rm], regs[rm + 1].
 */
void HELPER(neon_qunzip32)(CPUState *env, uint32_t rd, uint32_t rm)
{
    const uint64_t low_word = 0xffffffffull;
    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
    /* Low words pair up in the "d" results, high words in the "m" ones.  */
    uint64_t d0 = (zd0 & low_word) | (zd1 << 32);
    uint64_t d1 = (zm0 & low_word) | (zm1 << 32);
    uint64_t m0 = (zd0 >> 32) | (zd1 & ~low_word);
    uint64_t m1 = (zm0 >> 32) | (zm1 & ~low_word);

    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rm + 1] = make_float64(m1);
    env->vfp.regs[rd] = make_float64(d0);
    env->vfp.regs[rd + 1] = make_float64(d1);
}
1877

    
1878
/* De-interleave the 8-bit elements of regs[rd] followed by regs[rm]:
   even-numbered bytes go to regs[rd], odd-numbered bytes to regs[rm].  */
void HELPER(neon_unzip8)(CPUState *env, uint32_t rd, uint32_t rm)
{
    uint64_t src[2];
    uint64_t even = 0;
    uint64_t odd = 0;
    int i;

    src[0] = float64_val(env->vfp.regs[rd]);
    src[1] = float64_val(env->vfp.regs[rm]);

    /* Output byte i comes from source i / 4, element pair i % 4.  */
    for (i = 0; i < 8; i++) {
        uint64_t reg = src[i / 4];
        int pos = (i % 4) * 2;

        even |= ELEM(reg, pos, 8) << (i * 8);
        odd |= ELEM(reg, pos + 1, 8) << (i * 8);
    }
    env->vfp.regs[rm] = make_float64(odd);
    env->vfp.regs[rd] = make_float64(even);
}
1893

    
1894
/* De-interleave the 16-bit elements of regs[rd] followed by regs[rm]:
   even-numbered elements go to regs[rd], odd-numbered to regs[rm].  */
void HELPER(neon_unzip16)(CPUState *env, uint32_t rd, uint32_t rm)
{
    uint64_t src[2];
    uint64_t even = 0;
    uint64_t odd = 0;
    int i;

    src[0] = float64_val(env->vfp.regs[rd]);
    src[1] = float64_val(env->vfp.regs[rm]);

    /* Output element i comes from source i / 2, element pair i % 2.  */
    for (i = 0; i < 4; i++) {
        uint64_t reg = src[i / 2];
        int pos = (i % 2) * 2;

        even |= ELEM(reg, pos, 16) << (i * 16);
        odd |= ELEM(reg, pos + 1, 16) << (i * 16);
    }
    env->vfp.regs[rm] = make_float64(odd);
    env->vfp.regs[rd] = make_float64(even);
}
1905

    
1906
/* Interleave the 8-bit elements of two register pairs.
 * Bytes of regs[rd], regs[rd + 1] alternate with bytes of regs[rm],
 * regs[rm + 1]; the first half of the interleaved sequence is written to
 * regs[rd], regs[rd + 1] and the second half to regs[rm], regs[rm + 1].
 */
void HELPER(neon_qzip8)(CPUState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm[2];
    uint64_t zd[2];
    uint64_t out[4] = { 0, 0, 0, 0 };
    int k;
    int j;

    zm[0] = float64_val(env->vfp.regs[rm]);
    zm[1] = float64_val(env->vfp.regs[rm + 1]);
    zd[0] = float64_val(env->vfp.regs[rd]);
    zd[1] = float64_val(env->vfp.regs[rd + 1]);

    /* out[k] interleaves four bytes from source register k / 2,
       starting at element (k % 2) * 4.  */
    for (k = 0; k < 4; k++) {
        int base = (k % 2) * 4;

        for (j = 0; j < 4; j++) {
            out[k] |= ELEM(zd[k / 2], base + j, 8) << (16 * j);
            out[k] |= ELEM(zm[k / 2], base + j, 8) << (16 * j + 8);
        }
    }
    env->vfp.regs[rm] = make_float64(out[2]);
    env->vfp.regs[rm + 1] = make_float64(out[3]);
    env->vfp.regs[rd] = make_float64(out[0]);
    env->vfp.regs[rd + 1] = make_float64(out[1]);
}
1933

    
1934
/* Interleave the 16-bit elements of two register pairs.
 * Elements of regs[rd], regs[rd + 1] alternate with elements of
 * regs[rm], regs[rm + 1]; the first half of the interleaved sequence is
 * written to regs[rd], regs[rd + 1] and the second half to regs[rm],
 * regs[rm + 1].
 */
void HELPER(neon_qzip16)(CPUState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm[2];
    uint64_t zd[2];
    uint64_t out[4] = { 0, 0, 0, 0 };
    int k;
    int j;

    zm[0] = float64_val(env->vfp.regs[rm]);
    zm[1] = float64_val(env->vfp.regs[rm + 1]);
    zd[0] = float64_val(env->vfp.regs[rd]);
    zd[1] = float64_val(env->vfp.regs[rd + 1]);

    /* out[k] interleaves two elements from source register k / 2,
       starting at element (k % 2) * 2.  */
    for (k = 0; k < 4; k++) {
        int base = (k % 2) * 2;

        for (j = 0; j < 2; j++) {
            out[k] |= ELEM(zd[k / 2], base + j, 16) << (32 * j);
            out[k] |= ELEM(zm[k / 2], base + j, 16) << (32 * j + 16);
        }
    }
    env->vfp.regs[rm] = make_float64(out[2]);
    env->vfp.regs[rm + 1] = make_float64(out[3]);
    env->vfp.regs[rd] = make_float64(out[0]);
    env->vfp.regs[rd + 1] = make_float64(out[1]);
}
1953

    
1954
/* Interleave the 32-bit elements of two register pairs.
 * Words of regs[rd], regs[rd + 1] alternate with words of regs[rm],
 * regs[rm + 1]; the first half of the interleaved sequence is written to
 * regs[rd], regs[rd + 1] and the second half to regs[rm], regs[rm + 1].
 */
void HELPER(neon_qzip32)(CPUState *env, uint32_t rd, uint32_t rm)
{
    const uint64_t low_word = 0xffffffffull;
    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
    /* Each result pairs a "d" word (low) with the matching "m" word.  */
    uint64_t d0 = (zd0 & low_word) | (zm0 << 32);
    uint64_t d1 = (zd0 >> 32) | (zm0 & ~low_word);
    uint64_t m0 = (zd1 & low_word) | (zm1 << 32);
    uint64_t m1 = (zd1 >> 32) | (zm1 & ~low_word);

    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rm + 1] = make_float64(m1);
    env->vfp.regs[rd] = make_float64(d0);
    env->vfp.regs[rd + 1] = make_float64(d1);
}
1969

    
1970
/* Interleave the 8-bit elements of regs[rd] and regs[rm]: the first
   eight interleaved bytes go to regs[rd], the remaining eight to
   regs[rm].  */
void HELPER(neon_zip8)(CPUState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm = float64_val(env->vfp.regs[rm]);
    uint64_t zd = float64_val(env->vfp.regs[rd]);
    uint64_t out[2] = { 0, 0 };
    int k;
    int j;

    /* out[k] interleaves elements 4k .. 4k + 3 of both sources.  */
    for (k = 0; k < 2; k++) {
        for (j = 0; j < 4; j++) {
            out[k] |= ELEM(zd, 4 * k + j, 8) << (16 * j);
            out[k] |= ELEM(zm, 4 * k + j, 8) << (16 * j + 8);
        }
    }
    env->vfp.regs[rm] = make_float64(out[1]);
    env->vfp.regs[rd] = make_float64(out[0]);
}
1985

    
1986
/* Interleave the 16-bit elements of regs[rd] and regs[rm]: the first
   four interleaved elements go to regs[rd], the remaining four to
   regs[rm].  */
void HELPER(neon_zip16)(CPUState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm = float64_val(env->vfp.regs[rm]);
    uint64_t zd = float64_val(env->vfp.regs[rd]);
    uint64_t out[2] = { 0, 0 };
    int k;
    int j;

    /* out[k] interleaves elements 2k and 2k + 1 of both sources.  */
    for (k = 0; k < 2; k++) {
        for (j = 0; j < 2; j++) {
            out[k] |= ELEM(zd, 2 * k + j, 16) << (32 * j);
            out[k] |= ELEM(zm, 2 * k + j, 16) << (32 * j + 16);
        }
    }
    env->vfp.regs[rm] = make_float64(out[1]);
    env->vfp.regs[rd] = make_float64(out[0]);
}