Statistics
| Branch: | Revision:

root / target-arm / neon_helper.c @ 0ecb72a5

History | View | Annotate | Download (52.9 kB)

1
/*
2
 * ARM NEON vector operations.
3
 *
4
 * Copyright (c) 2007, 2008 CodeSourcery.
5
 * Written by Paul Brook
6
 *
7
 * This code is licensed under the GNU GPL v2.
8
 */
9
#include <stdlib.h>
10
#include <stdio.h>
11

    
12
#include "cpu.h"
13
#include "exec-all.h"
14
#include "helper.h"
15

    
16
/* Most significant bit of a 32-bit and a 64-bit value.  Both expansions are
   fully parenthesized so they stay intact next to any operator.  */
#define SIGNBIT ((uint32_t)0x80000000)
#define SIGNBIT64 ((uint64_t)1 << 63)
18

    
19
/* Latch the saturation flag by OR-ing the Q bit into the stored FPSCR value.
   A plain assignment would wipe every other FPSCR field each time a helper
   saturates.  Expects a variable named env to be in scope where it is used.  */
#define SET_QC() (env->vfp.xregs[ARM_VFP_FPSCR] |= CPSR_Q)
20

    
21
/* Lane structures that overlay one 32-bit word.  v1 always names the least
   significant lane, so the member order is reversed on big-endian hosts.  */
#define NEON_TYPE1(name, type) \
typedef struct { \
    type v1; \
} neon_##name;

#ifndef HOST_WORDS_BIGENDIAN
/* Little-endian host: lanes are laid out in ascending order.  */
#define NEON_TYPE2(name, type) \
typedef struct { \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct { \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#else
/* Big-endian host: lanes are laid out in descending order.  */
#define NEON_TYPE2(name, type) \
typedef struct { \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct { \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#endif

/* Four 8-bit lanes, two 16-bit lanes, or one 32-bit lane per word.  */
NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1
67

    
68
/* Reinterpret a uint32_t as a vector structure type (via a union).  */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        uint32_t word; \
        vtype lanes; \
    } pun; \
    pun.word = (val); \
    dest = pun.lanes; \
} while (0)

/* Reinterpret a vector structure type as a uint32_t (via a union).  */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        uint32_t word; \
        vtype lanes; \
    } pun; \
    pun.lanes = (val); \
    dest = pun.word; \
} while (0)
87

    
88
/* Apply NEON_FN lane by lane to vsrc1/vsrc2, writing vdest.
   The numeric suffix is the number of lanes processed.  */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_DO1 \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_DO2 \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);
98

    
99
/* Shared function body for two-operand helpers: unpack arg1 and arg2 into
   lane structures, run NEON_FN over the n lanes, and repack the result.  */
#define NEON_VOP_BODY(vtype, n) \
{ \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    uint32_t res; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Two-operand helper that does not take the CPU state.  */
#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* Two-operand helper that receives env, for NEON_FN bodies using SET_QC().  */
#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)
119

    
120
/* Pairwise operations.  */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same.  */
/* Adjacent lanes of vsrc1 feed the low result lanes and adjacent lanes of
   vsrc2 feed the high result lanes.  The final line of each macro carries no
   continuation backslash, so the definition cannot absorb the next line.  */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4);
131

    
132
/* Define a pairwise helper: unpack both operands, combine neighbouring
   lanes with NEON_FN via NEON_PDO<n>, and repack the result.  */
#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    uint32_t res; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}
145

    
146
/* Unary operators.  */
/* NEON_DO<n> still names vsrc2, so NEON_FN must ignore its third argument
   here; the packed result is written back into arg and returned.  */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vdest; \
    vtype vsrc1; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}
157

    
158

    
159
#define NEON_USAT(dest, src1, src2, type) do { \
160
    uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
161
    if (tmp != (type)tmp) { \
162
        SET_QC(); \
163
        dest = ~0; \
164
    } else { \
165
        dest = tmp; \
166
    }} while(0)
167
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
168
NEON_VOP_ENV(qadd_u8, neon_u8, 4)
169
#undef NEON_FN
170
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
171
NEON_VOP_ENV(qadd_u16, neon_u16, 2)
172
#undef NEON_FN
173
#undef NEON_USAT
174

    
175
/* Unsigned saturating 32-bit add.  A sum that wraps below the first operand
   is clamped to all ones and SET_QC() is invoked.  */
uint32_t HELPER(neon_qadd_u32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    uint32_t sum = a + b;

    if (sum >= a) {
        return sum;
    }
    SET_QC();
    return ~(uint32_t)0;
}
184

    
185
/* Unsigned saturating 64-bit add.  A sum that wraps below the first operand
   is clamped to all ones and SET_QC() is invoked.  */
uint64_t HELPER(neon_qadd_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    uint64_t sum = src1 + src2;

    if (sum >= src1) {
        return sum;
    }
    SET_QC();
    return ~(uint64_t)0;
}
196

    
197
#define NEON_SSAT(dest, src1, src2, type) do { \
198
    int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
199
    if (tmp != (type)tmp) { \
200
        SET_QC(); \
201
        if (src2 > 0) { \
202
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
203
        } else { \
204
            tmp = 1 << (sizeof(type) * 8 - 1); \
205
        } \
206
    } \
207
    dest = tmp; \
208
    } while(0)
209
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
210
NEON_VOP_ENV(qadd_s8, neon_s8, 4)
211
#undef NEON_FN
212
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
213
NEON_VOP_ENV(qadd_s16, neon_s16, 2)
214
#undef NEON_FN
215
#undef NEON_SSAT
216

    
217
/* Signed saturating 32-bit add on raw bit patterns.  Overflow is detected
   when both operands share a sign bit and the sum's sign bit differs from
   it; the result is then built from the sign of a and SET_QC() is invoked.  */
uint32_t HELPER(neon_qadd_s32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    uint32_t sum = a + b;
    uint32_t sign_changed = (sum ^ a) & SIGNBIT;
    uint32_t operands_differ = (a ^ b) & SIGNBIT;

    if (sign_changed && !operands_differ) {
        SET_QC();
        sum = ~(((int32_t)a >> 31) ^ SIGNBIT);
    }
    return sum;
}
226

    
227
/* Signed saturating 64-bit add on raw bit patterns.  Overflow is detected
   when both operands share a sign bit and the sum's sign bit differs from
   it; the result is then built from the sign of src1 and SET_QC() is
   invoked.  */
uint64_t HELPER(neon_qadd_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    uint64_t sum = src1 + src2;
    uint64_t sign_changed = (sum ^ src1) & SIGNBIT64;
    uint64_t operands_differ = (src1 ^ src2) & SIGNBIT64;

    if (sign_changed && !operands_differ) {
        SET_QC();
        sum = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
    }
    return sum;
}
238

    
239
#define NEON_USAT(dest, src1, src2, type) do { \
240
    uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
241
    if (tmp != (type)tmp) { \
242
        SET_QC(); \
243
        dest = 0; \
244
    } else { \
245
        dest = tmp; \
246
    }} while(0)
247
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
248
NEON_VOP_ENV(qsub_u8, neon_u8, 4)
249
#undef NEON_FN
250
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
251
NEON_VOP_ENV(qsub_u16, neon_u16, 2)
252
#undef NEON_FN
253
#undef NEON_USAT
254

    
255
/* Unsigned saturating 32-bit subtract.  When b exceeds a the result is
   clamped to zero and SET_QC() is invoked.  */
uint32_t HELPER(neon_qsub_u32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    if (b > a) {
        SET_QC();
        return 0;
    }
    return a - b;
}
264

    
265
/* Unsigned saturating 64-bit subtract.  When src2 exceeds src1 the result
   is clamped to zero and SET_QC() is invoked.  */
uint64_t HELPER(neon_qsub_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    if (src1 >= src2) {
        return src1 - src2;
    }
    SET_QC();
    return 0;
}
277

    
278
#define NEON_SSAT(dest, src1, src2, type) do { \
279
    int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
280
    if (tmp != (type)tmp) { \
281
        SET_QC(); \
282
        if (src2 < 0) { \
283
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
284
        } else { \
285
            tmp = 1 << (sizeof(type) * 8 - 1); \
286
        } \
287
    } \
288
    dest = tmp; \
289
    } while(0)
290
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
291
NEON_VOP_ENV(qsub_s8, neon_s8, 4)
292
#undef NEON_FN
293
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
294
NEON_VOP_ENV(qsub_s16, neon_s16, 2)
295
#undef NEON_FN
296
#undef NEON_SSAT
297

    
298
/* Signed saturating 32-bit subtract on raw bit patterns.  Overflow is
   detected when the operands have different sign bits and the result's sign
   bit differs from that of a; the result is then built from the sign of a
   and SET_QC() is invoked.  */
uint32_t HELPER(neon_qsub_s32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    uint32_t diff = a - b;
    uint32_t sign_changed = (diff ^ a) & SIGNBIT;
    uint32_t operands_differ = (a ^ b) & SIGNBIT;

    if (sign_changed && operands_differ) {
        SET_QC();
        diff = ~(((int32_t)a >> 31) ^ SIGNBIT);
    }
    return diff;
}
307

    
308
/* Signed saturating 64-bit subtract on raw bit patterns.  Overflow is
   detected when the operands have different sign bits and the result's sign
   bit differs from that of src1; the result is then built from the sign of
   src1 and SET_QC() is invoked.  */
uint64_t HELPER(neon_qsub_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    uint64_t diff = src1 - src2;
    uint64_t sign_changed = (diff ^ src1) & SIGNBIT64;
    uint64_t operands_differ = (src1 ^ src2) & SIGNBIT64;

    if (sign_changed && operands_differ) {
        SET_QC();
        diff = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
    }
    return diff;
}
319

    
320
#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
321
NEON_VOP(hadd_s8, neon_s8, 4)
322
NEON_VOP(hadd_u8, neon_u8, 4)
323
NEON_VOP(hadd_s16, neon_s16, 2)
324
NEON_VOP(hadd_u16, neon_u16, 2)
325
#undef NEON_FN
326

    
327
int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
328
{
329
    int32_t dest;
330

    
331
    dest = (src1 >> 1) + (src2 >> 1);
332
    if (src1 & src2 & 1)
333
        dest++;
334
    return dest;
335
}
336

    
337
uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
338
{
339
    uint32_t dest;
340

    
341
    dest = (src1 >> 1) + (src2 >> 1);
342
    if (src1 & src2 & 1)
343
        dest++;
344
    return dest;
345
}
346

    
347
#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
348
NEON_VOP(rhadd_s8, neon_s8, 4)
349
NEON_VOP(rhadd_u8, neon_u8, 4)
350
NEON_VOP(rhadd_s16, neon_s16, 2)
351
NEON_VOP(rhadd_u16, neon_u16, 2)
352
#undef NEON_FN
353

    
354
int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
355
{
356
    int32_t dest;
357

    
358
    dest = (src1 >> 1) + (src2 >> 1);
359
    if ((src1 | src2) & 1)
360
        dest++;
361
    return dest;
362
}
363

    
364
uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
365
{
366
    uint32_t dest;
367

    
368
    dest = (src1 >> 1) + (src2 >> 1);
369
    if ((src1 | src2) & 1)
370
        dest++;
371
    return dest;
372
}
373

    
374
#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
375
NEON_VOP(hsub_s8, neon_s8, 4)
376
NEON_VOP(hsub_u8, neon_u8, 4)
377
NEON_VOP(hsub_s16, neon_s16, 2)
378
NEON_VOP(hsub_u16, neon_u16, 2)
379
#undef NEON_FN
380

    
381
int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
382
{
383
    int32_t dest;
384

    
385
    dest = (src1 >> 1) - (src2 >> 1);
386
    if ((~src1) & src2 & 1)
387
        dest--;
388
    return dest;
389
}
390

    
391
uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
392
{
393
    uint32_t dest;
394

    
395
    dest = (src1 >> 1) - (src2 >> 1);
396
    if ((~src1) & src2 & 1)
397
        dest--;
398
    return dest;
399
}
400

    
401
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? ~0 : 0
402
NEON_VOP(cgt_s8, neon_s8, 4)
403
NEON_VOP(cgt_u8, neon_u8, 4)
404
NEON_VOP(cgt_s16, neon_s16, 2)
405
NEON_VOP(cgt_u16, neon_u16, 2)
406
NEON_VOP(cgt_s32, neon_s32, 1)
407
NEON_VOP(cgt_u32, neon_u32, 1)
408
#undef NEON_FN
409

    
410
#define NEON_FN(dest, src1, src2) dest = (src1 >= src2) ? ~0 : 0
411
NEON_VOP(cge_s8, neon_s8, 4)
412
NEON_VOP(cge_u8, neon_u8, 4)
413
NEON_VOP(cge_s16, neon_s16, 2)
414
NEON_VOP(cge_u16, neon_u16, 2)
415
NEON_VOP(cge_s32, neon_s32, 1)
416
NEON_VOP(cge_u32, neon_u32, 1)
417
#undef NEON_FN
418

    
419
#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
420
NEON_VOP(min_s8, neon_s8, 4)
421
NEON_VOP(min_u8, neon_u8, 4)
422
NEON_VOP(min_s16, neon_s16, 2)
423
NEON_VOP(min_u16, neon_u16, 2)
424
NEON_VOP(min_s32, neon_s32, 1)
425
NEON_VOP(min_u32, neon_u32, 1)
426
NEON_POP(pmin_s8, neon_s8, 4)
427
NEON_POP(pmin_u8, neon_u8, 4)
428
NEON_POP(pmin_s16, neon_s16, 2)
429
NEON_POP(pmin_u16, neon_u16, 2)
430
#undef NEON_FN
431

    
432
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
433
NEON_VOP(max_s8, neon_s8, 4)
434
NEON_VOP(max_u8, neon_u8, 4)
435
NEON_VOP(max_s16, neon_s16, 2)
436
NEON_VOP(max_u16, neon_u16, 2)
437
NEON_VOP(max_s32, neon_s32, 1)
438
NEON_VOP(max_u32, neon_u32, 1)
439
NEON_POP(pmax_s8, neon_s8, 4)
440
NEON_POP(pmax_u8, neon_u8, 4)
441
NEON_POP(pmax_s16, neon_s16, 2)
442
NEON_POP(pmax_u16, neon_u16, 2)
443
#undef NEON_FN
444

    
445
#define NEON_FN(dest, src1, src2) \
446
    dest = (src1 > src2) ? (src1 - src2) : (src2 - src1)
447
NEON_VOP(abd_s8, neon_s8, 4)
448
NEON_VOP(abd_u8, neon_u8, 4)
449
NEON_VOP(abd_s16, neon_s16, 2)
450
NEON_VOP(abd_u16, neon_u16, 2)
451
NEON_VOP(abd_s32, neon_s32, 1)
452
NEON_VOP(abd_u32, neon_u32, 1)
453
#undef NEON_FN
454

    
455
#define NEON_FN(dest, src1, src2) do { \
456
    int8_t tmp; \
457
    tmp = (int8_t)src2; \
458
    if (tmp >= (ssize_t)sizeof(src1) * 8 || \
459
        tmp <= -(ssize_t)sizeof(src1) * 8) { \
460
        dest = 0; \
461
    } else if (tmp < 0) { \
462
        dest = src1 >> -tmp; \
463
    } else { \
464
        dest = src1 << tmp; \
465
    }} while (0)
466
NEON_VOP(shl_u8, neon_u8, 4)
467
NEON_VOP(shl_u16, neon_u16, 2)
468
NEON_VOP(shl_u32, neon_u32, 1)
469
#undef NEON_FN
470

    
471
uint64_t HELPER(neon_shl_u64)(uint64_t val, uint64_t shiftop)
472
{
473
    int8_t shift = (int8_t)shiftop;
474
    if (shift >= 64 || shift <= -64) {
475
        val = 0;
476
    } else if (shift < 0) {
477
        val >>= -shift;
478
    } else {
479
        val <<= shift;
480
    }
481
    return val;
482
}
483

    
484
#define NEON_FN(dest, src1, src2) do { \
485
    int8_t tmp; \
486
    tmp = (int8_t)src2; \
487
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
488
        dest = 0; \
489
    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
490
        dest = src1 >> (sizeof(src1) * 8 - 1); \
491
    } else if (tmp < 0) { \
492
        dest = src1 >> -tmp; \
493
    } else { \
494
        dest = src1 << tmp; \
495
    }} while (0)
496
NEON_VOP(shl_s8, neon_s8, 4)
497
NEON_VOP(shl_s16, neon_s16, 2)
498
NEON_VOP(shl_s32, neon_s32, 1)
499
#undef NEON_FN
500

    
501
uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t shiftop)
502
{
503
    int8_t shift = (int8_t)shiftop;
504
    int64_t val = valop;
505
    if (shift >= 64) {
506
        val = 0;
507
    } else if (shift <= -64) {
508
        val >>= 63;
509
    } else if (shift < 0) {
510
        val >>= -shift;
511
    } else {
512
        val <<= shift;
513
    }
514
    return val;
515
}
516

    
517
#define NEON_FN(dest, src1, src2) do { \
518
    int8_t tmp; \
519
    tmp = (int8_t)src2; \
520
    if ((tmp >= (ssize_t)sizeof(src1) * 8) \
521
        || (tmp <= -(ssize_t)sizeof(src1) * 8)) { \
522
        dest = 0; \
523
    } else if (tmp < 0) { \
524
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
525
    } else { \
526
        dest = src1 << tmp; \
527
    }} while (0)
528
NEON_VOP(rshl_s8, neon_s8, 4)
529
NEON_VOP(rshl_s16, neon_s16, 2)
530
#undef NEON_FN
531

    
532
/* The addition of the rounding constant may overflow, so we use an
533
 * intermediate 64 bits accumulator.  */
534
uint32_t HELPER(neon_rshl_s32)(uint32_t valop, uint32_t shiftop)
535
{
536
    int32_t dest;
537
    int32_t val = (int32_t)valop;
538
    int8_t shift = (int8_t)shiftop;
539
    if ((shift >= 32) || (shift <= -32)) {
540
        dest = 0;
541
    } else if (shift < 0) {
542
        int64_t big_dest = ((int64_t)val + (1 << (-1 - shift)));
543
        dest = big_dest >> -shift;
544
    } else {
545
        dest = val << shift;
546
    }
547
    return dest;
548
}
549

    
550
/* Handling addition overflow with 64 bits inputs values is more
551
 * tricky than with 32 bits values.  */
552
uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop)
553
{
554
    int8_t shift = (int8_t)shiftop;
555
    int64_t val = valop;
556
    if ((shift >= 64) || (shift <= -64)) {
557
        val = 0;
558
    } else if (shift < 0) {
559
        val >>= (-shift - 1);
560
        if (val == INT64_MAX) {
561
            /* In this case, it means that the rounding constant is 1,
562
             * and the addition would overflow. Return the actual
563
             * result directly.  */
564
            val = 0x4000000000000000LL;
565
        } else {
566
            val++;
567
            val >>= 1;
568
        }
569
    } else {
570
        val <<= shift;
571
    }
572
    return val;
573
}
574

    
575
#define NEON_FN(dest, src1, src2) do { \
576
    int8_t tmp; \
577
    tmp = (int8_t)src2; \
578
    if (tmp >= (ssize_t)sizeof(src1) * 8 || \
579
        tmp < -(ssize_t)sizeof(src1) * 8) { \
580
        dest = 0; \
581
    } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
582
        dest = src1 >> (-tmp - 1); \
583
    } else if (tmp < 0) { \
584
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
585
    } else { \
586
        dest = src1 << tmp; \
587
    }} while (0)
588
NEON_VOP(rshl_u8, neon_u8, 4)
589
NEON_VOP(rshl_u16, neon_u16, 2)
590
#undef NEON_FN
591

    
592
/* The addition of the rounding constant may overflow, so we use an
593
 * intermediate 64 bits accumulator.  */
594
uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shiftop)
595
{
596
    uint32_t dest;
597
    int8_t shift = (int8_t)shiftop;
598
    if (shift >= 32 || shift < -32) {
599
        dest = 0;
600
    } else if (shift == -32) {
601
        dest = val >> 31;
602
    } else if (shift < 0) {
603
        uint64_t big_dest = ((uint64_t)val + (1 << (-1 - shift)));
604
        dest = big_dest >> -shift;
605
    } else {
606
        dest = val << shift;
607
    }
608
    return dest;
609
}
610

    
611
/* Handling addition overflow with 64 bits inputs values is more
612
 * tricky than with 32 bits values.  */
613
uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop)
614
{
615
    int8_t shift = (uint8_t)shiftop;
616
    if (shift >= 64 || shift < -64) {
617
        val = 0;
618
    } else if (shift == -64) {
619
        /* Rounding a 1-bit result just preserves that bit.  */
620
        val >>= 63;
621
    } else if (shift < 0) {
622
        val >>= (-shift - 1);
623
        if (val == UINT64_MAX) {
624
            /* In this case, it means that the rounding constant is 1,
625
             * and the addition would overflow. Return the actual
626
             * result directly.  */
627
            val = 0x8000000000000000ULL;
628
        } else {
629
            val++;
630
            val >>= 1;
631
        }
632
    } else {
633
        val <<= shift;
634
    }
635
    return val;
636
}
637

    
638
#define NEON_FN(dest, src1, src2) do { \
639
    int8_t tmp; \
640
    tmp = (int8_t)src2; \
641
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
642
        if (src1) { \
643
            SET_QC(); \
644
            dest = ~0; \
645
        } else { \
646
            dest = 0; \
647
        } \
648
    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
649
        dest = 0; \
650
    } else if (tmp < 0) { \
651
        dest = src1 >> -tmp; \
652
    } else { \
653
        dest = src1 << tmp; \
654
        if ((dest >> tmp) != src1) { \
655
            SET_QC(); \
656
            dest = ~0; \
657
        } \
658
    }} while (0)
659
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
660
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
661
NEON_VOP_ENV(qshl_u32, neon_u32, 1)
662
#undef NEON_FN
663

    
664
/* Unsigned saturating 64-bit variable shift.  The count is the low byte of
   shiftop taken as signed.  A left shift that loses set bits saturates to
   all ones and invokes SET_QC(); right shifts never saturate.  */
uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shiftop)
{
    int8_t shift = (int8_t)shiftop;
    uint64_t shifted;

    if (shift >= 64) {
        if (val == 0) {
            return val;
        }
        SET_QC();
        return ~(uint64_t)0;
    }
    if (shift <= -64) {
        return 0;
    }
    if (shift < 0) {
        return val >> -shift;
    }

    shifted = val << shift;
    /* Shifting back must reproduce the input, else bits were lost.  */
    if ((shifted >> shift) != val) {
        SET_QC();
        shifted = ~(uint64_t)0;
    }
    return shifted;
}
686

    
687
#define NEON_FN(dest, src1, src2) do { \
688
    int8_t tmp; \
689
    tmp = (int8_t)src2; \
690
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
691
        if (src1) { \
692
            SET_QC(); \
693
            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
694
            if (src1 > 0) { \
695
                dest--; \
696
            } \
697
        } else { \
698
            dest = src1; \
699
        } \
700
    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
701
        dest = src1 >> 31; \
702
    } else if (tmp < 0) { \
703
        dest = src1 >> -tmp; \
704
    } else { \
705
        dest = src1 << tmp; \
706
        if ((dest >> tmp) != src1) { \
707
            SET_QC(); \
708
            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
709
            if (src1 > 0) { \
710
                dest--; \
711
            } \
712
        } \
713
    }} while (0)
714
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
715
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
716
NEON_VOP_ENV(qshl_s32, neon_s32, 1)
717
#undef NEON_FN
718

    
719
/* Signed saturating 64-bit variable shift.  The count is the low byte of
   shiftop taken as signed.  On left-shift overflow SET_QC() is invoked and
   the result is derived from the sign of the input via ~SIGNBIT64.  */
uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t valop, uint64_t shiftop)
{
    int8_t shift = (uint8_t)shiftop;
    int64_t val = valop;
    int64_t shifted;

    if (shift >= 64) {
        if (val) {
            SET_QC();
            val = (val >> 63) ^ ~SIGNBIT64;
        }
        return val;
    }
    if (shift <= -64) {
        return val >> 63;
    }
    if (shift < 0) {
        return val >> -shift;
    }

    shifted = val << shift;
    /* Shifting back must reproduce the input, else bits were lost.  */
    if ((shifted >> shift) != val) {
        SET_QC();
        shifted = (val >> 63) ^ ~SIGNBIT64;
    }
    return shifted;
}
742

    
743
#define NEON_FN(dest, src1, src2) do { \
744
    if (src1 & (1 << (sizeof(src1) * 8 - 1))) { \
745
        SET_QC(); \
746
        dest = 0; \
747
    } else { \
748
        int8_t tmp; \
749
        tmp = (int8_t)src2; \
750
        if (tmp >= (ssize_t)sizeof(src1) * 8) { \
751
            if (src1) { \
752
                SET_QC(); \
753
                dest = ~0; \
754
            } else { \
755
                dest = 0; \
756
            } \
757
        } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
758
            dest = 0; \
759
        } else if (tmp < 0) { \
760
            dest = src1 >> -tmp; \
761
        } else { \
762
            dest = src1 << tmp; \
763
            if ((dest >> tmp) != src1) { \
764
                SET_QC(); \
765
                dest = ~0; \
766
            } \
767
        } \
768
    }} while (0)
769
NEON_VOP_ENV(qshlu_s8, neon_u8, 4)
770
NEON_VOP_ENV(qshlu_s16, neon_u16, 2)
771
#undef NEON_FN
772

    
773
/* Saturating shift of a signed 32-bit input to an unsigned result.
   A negative input saturates to zero and invokes SET_QC(); anything else is
   delegated to the unsigned saturating shift helper.  */
uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t valop, uint32_t shiftop)
{
    if ((int32_t)valop >= 0) {
        return helper_neon_qshl_u32(env, valop, shiftop);
    }
    SET_QC();
    return 0;
}
781

    
782
/* Saturating shift of a signed 64-bit input to an unsigned result.
   A negative input saturates to zero and invokes SET_QC(); anything else is
   delegated to the unsigned saturating shift helper.  */
uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t valop, uint64_t shiftop)
{
    if ((int64_t)valop >= 0) {
        return helper_neon_qshl_u64(env, valop, shiftop);
    }
    SET_QC();
    return 0;
}
790

    
791
/* FIXME: This is wrong.  */
792
#define NEON_FN(dest, src1, src2) do { \
793
    int8_t tmp; \
794
    tmp = (int8_t)src2; \
795
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
796
        if (src1) { \
797
            SET_QC(); \
798
            dest = ~0; \
799
        } else { \
800
            dest = 0; \
801
        } \
802
    } else if (tmp < -(ssize_t)sizeof(src1) * 8) { \
803
        dest = 0; \
804
    } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
805
        dest = src1 >> (sizeof(src1) * 8 - 1); \
806
    } else if (tmp < 0) { \
807
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
808
    } else { \
809
        dest = src1 << tmp; \
810
        if ((dest >> tmp) != src1) { \
811
            SET_QC(); \
812
            dest = ~0; \
813
        } \
814
    }} while (0)
815
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
816
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
817
#undef NEON_FN
818

    
819
/* The addition of the rounding constant may overflow, so we use an
820
 * intermediate 64 bits accumulator.  */
821
uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shiftop)
822
{
823
    uint32_t dest;
824
    int8_t shift = (int8_t)shiftop;
825
    if (shift >= 32) {
826
        if (val) {
827
            SET_QC();
828
            dest = ~0;
829
        } else {
830
            dest = 0;
831
        }
832
    } else if (shift < -32) {
833
        dest = 0;
834
    } else if (shift == -32) {
835
        dest = val >> 31;
836
    } else if (shift < 0) {
837
        uint64_t big_dest = ((uint64_t)val + (1 << (-1 - shift)));
838
        dest = big_dest >> -shift;
839
    } else {
840
        dest = val << shift;
841
        if ((dest >> shift) != val) {
842
            SET_QC();
843
            dest = ~0;
844
        }
845
    }
846
    return dest;
847
}
848

    
849
/* Handling addition overflow with 64 bits inputs values is more
850
 * tricky than with 32 bits values.  */
851
uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shiftop)
852
{
853
    int8_t shift = (int8_t)shiftop;
854
    if (shift >= 64) {
855
        if (val) {
856
            SET_QC();
857
            val = ~0;
858
        }
859
    } else if (shift < -64) {
860
        val = 0;
861
    } else if (shift == -64) {
862
        val >>= 63;
863
    } else if (shift < 0) {
864
        val >>= (-shift - 1);
865
        if (val == UINT64_MAX) {
866
            /* In this case, it means that the rounding constant is 1,
867
             * and the addition would overflow. Return the actual
868
             * result directly.  */
869
            val = 0x8000000000000000ULL;
870
        } else {
871
            val++;
872
            val >>= 1;
873
        }
874
    } else { \
875
        uint64_t tmp = val;
876
        val <<= shift;
877
        if ((val >> shift) != tmp) {
878
            SET_QC();
879
            val = ~0;
880
        }
881
    }
882
    return val;
883
}
884

    
885
#define NEON_FN(dest, src1, src2) do { \
886
    int8_t tmp; \
887
    tmp = (int8_t)src2; \
888
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
889
        if (src1) { \
890
            SET_QC(); \
891
            dest = (1 << (sizeof(src1) * 8 - 1)); \
892
            if (src1 > 0) { \
893
                dest--; \
894
            } \
895
        } else { \
896
            dest = 0; \
897
        } \
898
    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
899
        dest = 0; \
900
    } else if (tmp < 0) { \
901
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
902
    } else { \
903
        dest = src1 << tmp; \
904
        if ((dest >> tmp) != src1) { \
905
            SET_QC(); \
906
            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
907
            if (src1 > 0) { \
908
                dest--; \
909
            } \
910
        } \
911
    }} while (0)
912
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
913
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
914
#undef NEON_FN
915

    
916
/* The addition of the rounding constant may overflow, so we use an
917
 * intermediate 64 bits accumulator.  */
918
uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t valop, uint32_t shiftop)
919
{
920
    int32_t dest;
921
    int32_t val = (int32_t)valop;
922
    int8_t shift = (int8_t)shiftop;
923
    if (shift >= 32) {
924
        if (val) {
925
            SET_QC();
926
            dest = (val >> 31) ^ ~SIGNBIT;
927
        } else {
928
            dest = 0;
929
        }
930
    } else if (shift <= -32) {
931
        dest = 0;
932
    } else if (shift < 0) {
933
        int64_t big_dest = ((int64_t)val + (1 << (-1 - shift)));
934
        dest = big_dest >> -shift;
935
    } else {
936
        dest = val << shift;
937
        if ((dest >> shift) != val) {
938
            SET_QC();
939
            dest = (val >> 31) ^ ~SIGNBIT;
940
        }
941
    }
942
    return dest;
943
}
944

    
945
/* Handling addition overflow with 64 bits inputs values is more
946
 * tricky than with 32 bits values.  */
947
uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t valop, uint64_t shiftop)
948
{
949
    int8_t shift = (uint8_t)shiftop;
950
    int64_t val = valop;
951

    
952
    if (shift >= 64) {
953
        if (val) {
954
            SET_QC();
955
            val = (val >> 63) ^ ~SIGNBIT64;
956
        }
957
    } else if (shift <= -64) {
958
        val = 0;
959
    } else if (shift < 0) {
960
        val >>= (-shift - 1);
961
        if (val == INT64_MAX) {
962
            /* In this case, it means that the rounding constant is 1,
963
             * and the addition would overflow. Return the actual
964
             * result directly.  */
965
            val = 0x4000000000000000ULL;
966
        } else {
967
            val++;
968
            val >>= 1;
969
        }
970
    } else {
971
        int64_t tmp = val;
972
        val <<= shift;
973
        if ((val >> shift) != tmp) {
974
            SET_QC();
975
            val = (tmp >> 63) ^ ~SIGNBIT64;
976
        }
977
    }
978
    return val;
979
}
980

    
981
uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
982
{
983
    uint32_t mask;
984
    mask = (a ^ b) & 0x80808080u;
985
    a &= ~0x80808080u;
986
    b &= ~0x80808080u;
987
    return (a + b) ^ mask;
988
}
989

    
990
uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
991
{
992
    uint32_t mask;
993
    mask = (a ^ b) & 0x80008000u;
994
    a &= ~0x80008000u;
995
    b &= ~0x80008000u;
996
    return (a + b) ^ mask;
997
}
998

    
999
#define NEON_FN(dest, src1, src2) dest = src1 + src2
1000
NEON_POP(padd_u8, neon_u8, 4)
1001
NEON_POP(padd_u16, neon_u16, 2)
1002
#undef NEON_FN
1003

    
1004
#define NEON_FN(dest, src1, src2) dest = src1 - src2
1005
NEON_VOP(sub_u8, neon_u8, 4)
1006
NEON_VOP(sub_u16, neon_u16, 2)
1007
#undef NEON_FN
1008

    
1009
#define NEON_FN(dest, src1, src2) dest = src1 * src2
1010
NEON_VOP(mul_u8, neon_u8, 4)
1011
NEON_VOP(mul_u16, neon_u16, 2)
1012
#undef NEON_FN
1013

    
1014
/* Polynomial multiplication is like integer multiplication except the
1015
   partial products are XORed, not added.  */
1016
uint32_t HELPER(neon_mul_p8)(uint32_t op1, uint32_t op2)
1017
{
1018
    uint32_t mask;
1019
    uint32_t result;
1020
    result = 0;
1021
    while (op1) {
1022
        mask = 0;
1023
        if (op1 & 1)
1024
            mask |= 0xff;
1025
        if (op1 & (1 << 8))
1026
            mask |= (0xff << 8);
1027
        if (op1 & (1 << 16))
1028
            mask |= (0xff << 16);
1029
        if (op1 & (1 << 24))
1030
            mask |= (0xff << 24);
1031
        result ^= op2 & mask;
1032
        op1 = (op1 >> 1) & 0x7f7f7f7f;
1033
        op2 = (op2 << 1) & 0xfefefefe;
1034
    }
1035
    return result;
1036
}
1037

    
1038
uint64_t HELPER(neon_mull_p8)(uint32_t op1, uint32_t op2)
1039
{
1040
    uint64_t result = 0;
1041
    uint64_t mask;
1042
    uint64_t op2ex = op2;
1043
    op2ex = (op2ex & 0xff) |
1044
        ((op2ex & 0xff00) << 8) |
1045
        ((op2ex & 0xff0000) << 16) |
1046
        ((op2ex & 0xff000000) << 24);
1047
    while (op1) {
1048
        mask = 0;
1049
        if (op1 & 1) {
1050
            mask |= 0xffff;
1051
        }
1052
        if (op1 & (1 << 8)) {
1053
            mask |= (0xffffU << 16);
1054
        }
1055
        if (op1 & (1 << 16)) {
1056
            mask |= (0xffffULL << 32);
1057
        }
1058
        if (op1 & (1 << 24)) {
1059
            mask |= (0xffffULL << 48);
1060
        }
1061
        result ^= op2ex & mask;
1062
        op1 = (op1 >> 1) & 0x7f7f7f7f;
1063
        op2ex <<= 1;
1064
    }
1065
    return result;
1066
}
1067

    
1068
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
1069
NEON_VOP(tst_u8, neon_u8, 4)
1070
NEON_VOP(tst_u16, neon_u16, 2)
1071
NEON_VOP(tst_u32, neon_u32, 1)
1072
#undef NEON_FN
1073

    
1074
#define NEON_FN(dest, src1, src2) dest = (src1 == src2) ? -1 : 0
1075
NEON_VOP(ceq_u8, neon_u8, 4)
1076
NEON_VOP(ceq_u16, neon_u16, 2)
1077
NEON_VOP(ceq_u32, neon_u32, 1)
1078
#undef NEON_FN
1079

    
1080
#define NEON_FN(dest, src, dummy) dest = (src < 0) ? -src : src
1081
NEON_VOP1(abs_s8, neon_s8, 4)
1082
NEON_VOP1(abs_s16, neon_s16, 2)
1083
#undef NEON_FN
1084

    
1085
/* Count Leading Sign/Zero Bits.  */

/* Number of leading zero bits in an 8-bit value; 8 when x is zero.  */
static inline int do_clz8(uint8_t x)
{
    int zeros = 8;

    /* Every shift needed to empty x is one significant (non-leading) bit.  */
    while (x != 0) {
        x >>= 1;
        zeros--;
    }
    return zeros;
}
1093

    
1094
/* Number of leading zero bits in a 16-bit value; 16 when x is zero.  */
static inline int do_clz16(uint16_t x)
{
    int zeros = 16;

    /* Every shift needed to empty x is one significant (non-leading) bit.  */
    while (x != 0) {
        x >>= 1;
        zeros--;
    }
    return zeros;
}
1101

    
1102
#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
1103
NEON_VOP1(clz_u8, neon_u8, 4)
1104
#undef NEON_FN
1105

    
1106
#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
1107
NEON_VOP1(clz_u16, neon_u16, 2)
1108
#undef NEON_FN
1109

    
1110
#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
1111
NEON_VOP1(cls_s8, neon_s8, 4)
1112
#undef NEON_FN
1113

    
1114
#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
1115
NEON_VOP1(cls_s16, neon_s16, 2)
1116
#undef NEON_FN
1117

    
1118
uint32_t HELPER(neon_cls_s32)(uint32_t x)
1119
{
1120
    int count;
1121
    if ((int32_t)x < 0)
1122
        x = ~x;
1123
    for (count = 32; x; count--)
1124
        x = x >> 1;
1125
    return count - 1;
1126
}
1127

    
1128
/* Bit count.  */
1129
uint32_t HELPER(neon_cnt_u8)(uint32_t x)
1130
{
1131
    x = (x & 0x55555555) + ((x >>  1) & 0x55555555);
1132
    x = (x & 0x33333333) + ((x >>  2) & 0x33333333);
1133
    x = (x & 0x0f0f0f0f) + ((x >>  4) & 0x0f0f0f0f);
1134
    return x;
1135
}
1136

    
1137
#define NEON_QDMULH16(dest, src1, src2, round) do { \
1138
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
1139
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
1140
        SET_QC(); \
1141
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
1142
    } else { \
1143
        tmp <<= 1; \
1144
    } \
1145
    if (round) { \
1146
        int32_t old = tmp; \
1147
        tmp += 1 << 15; \
1148
        if ((int32_t)tmp < old) { \
1149
            SET_QC(); \
1150
            tmp = SIGNBIT - 1; \
1151
        } \
1152
    } \
1153
    dest = tmp >> 16; \
1154
    } while(0)
1155
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
1156
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
1157
#undef NEON_FN
1158
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
1159
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
1160
#undef NEON_FN
1161
#undef NEON_QDMULH16
1162

    
1163
#define NEON_QDMULH32(dest, src1, src2, round) do { \
1164
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
1165
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
1166
        SET_QC(); \
1167
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
1168
    } else { \
1169
        tmp <<= 1; \
1170
    } \
1171
    if (round) { \
1172
        int64_t old = tmp; \
1173
        tmp += (int64_t)1 << 31; \
1174
        if ((int64_t)tmp < old) { \
1175
            SET_QC(); \
1176
            tmp = SIGNBIT64 - 1; \
1177
        } \
1178
    } \
1179
    dest = tmp >> 32; \
1180
    } while(0)
1181
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
1182
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
1183
#undef NEON_FN
1184
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
1185
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
1186
#undef NEON_FN
1187
#undef NEON_QDMULH32
1188

    
1189
uint32_t HELPER(neon_narrow_u8)(uint64_t x)
1190
{
1191
    return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
1192
           | ((x >> 24) & 0xff000000u);
1193
}
1194

    
1195
uint32_t HELPER(neon_narrow_u16)(uint64_t x)
1196
{
1197
    return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
1198
}
1199

    
1200
uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
1201
{
1202
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
1203
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
1204
}
1205

    
1206
uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
1207
{
1208
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
1209
}
1210

    
1211
uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
1212
{
1213
    x &= 0xff80ff80ff80ff80ull;
1214
    x += 0x0080008000800080ull;
1215
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
1216
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
1217
}
1218

    
1219
uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
1220
{
1221
    x &= 0xffff8000ffff8000ull;
1222
    x += 0x0000800000008000ull;
1223
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
1224
}
1225

    
1226
/* Narrow four signed 16-bit lanes of x to unsigned bytes with saturation:
   negative lanes give 0, lanes above 0xff give 0xff.  SET_QC() is invoked
   for every lane that had to be clamped.  */
uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
{
    uint32_t packed = 0;
    int lane;

    for (lane = 0; lane < 4; lane++) {
        uint16_t elem = x >> (lane * 16);

        if (elem & 0x8000) {
            /* Negative input: the result byte stays zero.  */
            SET_QC();
        } else if (elem > 0xff) {
            SET_QC();
            packed |= (uint32_t)0xff << (lane * 8);
        } else {
            packed |= (uint32_t)elem << (lane * 8);
        }
    }
    return packed;
}
1252

    
1253
/* Narrow four unsigned 16-bit lanes of x to bytes with saturation: lanes
   above 0xff give 0xff and invoke SET_QC().  */
uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
{
    uint32_t packed = 0;
    int lane;

    for (lane = 0; lane < 4; lane++) {
        uint16_t elem = x >> (lane * 16);
        uint8_t out;

        if (elem <= 0xff) {
            out = elem;
        } else {
            SET_QC();
            out = 0xff;
        }
        packed |= (uint32_t)out << (lane * 8);
    }
    return packed;
}
1275

    
1276
/* Narrow four signed 16-bit lanes of x to signed bytes with saturation:
   lanes that do not fit in an int8_t become 0x7f or 0x80 according to
   their sign and invoke SET_QC().  */
uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
{
    uint32_t packed = 0;
    int lane;

    for (lane = 0; lane < 4; lane++) {
        int16_t elem = x >> (lane * 16);
        uint8_t out;

        if (elem == (int8_t)elem) {
            out = elem;
        } else {
            /* elem >> 15 is 0 or -1, selecting 0x7f or 0x80.  */
            SET_QC();
            out = (elem >> 15) ^ 0x7f;
        }
        packed |= (uint32_t)out << (lane * 8);
    }
    return packed;
}
1298

    
1299
/* Narrow two signed 32-bit lanes of x to unsigned 16-bit values with
   saturation: negative lanes give 0, lanes above 0xffff give 0xffff.
   SET_QC() is invoked for every lane that had to be clamped.  */
uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
{
    uint32_t half[2];
    int i;

    half[0] = x;
    half[1] = x >> 32;
    for (i = 0; i < 2; i++) {
        if (half[i] & 0x80000000) {
            SET_QC();
            half[i] = 0;
        } else if (half[i] > 0xffff) {
            SET_QC();
            half[i] = 0xffff;
        }
    }
    return half[0] | (half[1] << 16);
}
1321

    
1322
/* Narrow two unsigned 32-bit lanes of x to 16 bits with saturation: lanes
   above 0xffff give 0xffff and invoke SET_QC().  */
uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
{
    uint32_t half[2];
    int i;

    half[0] = x;
    half[1] = x >> 32;
    for (i = 0; i < 2; i++) {
        if (half[i] > 0xffff) {
            SET_QC();
            half[i] = 0xffff;
        }
    }
    return half[0] | (half[1] << 16);
}
1338

    
1339
/* Narrow two signed 32-bit lanes of x to signed 16-bit values with
   saturation: lanes that do not fit in an int16_t become 0x7fff or 0x8000
   according to their sign and invoke SET_QC().  The low lane ends up in
   bits 0-15 of the result, the high lane in bits 16-31.  */
uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
{
    int32_t low;
    int32_t high;

    low = x;
    if (low != (int16_t)low) {
        /* low >> 31 is 0 or -1, selecting 0x7fff or 0x8000 (as 16 bits).  */
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    /* Shift as unsigned: left-shifting a negative int32_t is undefined.  */
    return (uint16_t)low | ((uint32_t)high << 16);
}
1355

    
1356
/* Narrow a signed 64-bit value to unsigned 32 bits with saturation:
   negative inputs give 0, inputs above 0xffffffff give 0xffffffff.
   SET_QC() is invoked whenever the value had to be clamped.  */
uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
{
    uint32_t res = x;

    if (x >> 63) {
        SET_QC();
        res = 0;
    } else if (x >> 32) {
        SET_QC();
        res = 0xffffffffu;
    }
    return res;
}
1368

    
1369
/* Narrow an unsigned 64-bit value to 32 bits with saturation: inputs
   above 0xffffffff give 0xffffffff and invoke SET_QC().  */
uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
{
    if ((x >> 32) == 0) {
        return x;
    }
    SET_QC();
    return 0xffffffffu;
}
1377

    
1378
/* Narrow a signed 64-bit value to signed 32 bits with saturation: inputs
   that do not fit become 0x7fffffff or 0x80000000 according to their sign
   and invoke SET_QC().  */
uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
{
    int64_t wide = (int64_t)x;

    if (wide == (int32_t)x) {
        return x;
    }
    SET_QC();
    /* wide >> 63 is 0 or -1, selecting the positive or negative limit.  */
    return (wide >> 63) ^ 0x7fffffff;
}
1386

    
1387
uint64_t HELPER(neon_widen_u8)(uint32_t x)
1388
{
1389
    uint64_t tmp;
1390
    uint64_t ret;
1391
    ret = (uint8_t)x;
1392
    tmp = (uint8_t)(x >> 8);
1393
    ret |= tmp << 16;
1394
    tmp = (uint8_t)(x >> 16);
1395
    ret |= tmp << 32;
1396
    tmp = (uint8_t)(x >> 24);
1397
    ret |= tmp << 48;
1398
    return ret;
1399
}
1400

    
1401
uint64_t HELPER(neon_widen_s8)(uint32_t x)
1402
{
1403
    uint64_t tmp;
1404
    uint64_t ret;
1405
    ret = (uint16_t)(int8_t)x;
1406
    tmp = (uint16_t)(int8_t)(x >> 8);
1407
    ret |= tmp << 16;
1408
    tmp = (uint16_t)(int8_t)(x >> 16);
1409
    ret |= tmp << 32;
1410
    tmp = (uint16_t)(int8_t)(x >> 24);
1411
    ret |= tmp << 48;
1412
    return ret;
1413
}
1414

    
1415
uint64_t HELPER(neon_widen_u16)(uint32_t x)
1416
{
1417
    uint64_t high = (uint16_t)(x >> 16);
1418
    return ((uint16_t)x) | (high << 32);
1419
}
1420

    
1421
uint64_t HELPER(neon_widen_s16)(uint32_t x)
1422
{
1423
    uint64_t high = (int16_t)(x >> 16);
1424
    return ((uint32_t)(int16_t)x) | (high << 32);
1425
}
1426

    
1427
uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
1428
{
1429
    uint64_t mask;
1430
    mask = (a ^ b) & 0x8000800080008000ull;
1431
    a &= ~0x8000800080008000ull;
1432
    b &= ~0x8000800080008000ull;
1433
    return (a + b) ^ mask;
1434
}
1435

    
1436
uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
1437
{
1438
    uint64_t mask;
1439
    mask = (a ^ b) & 0x8000000080000000ull;
1440
    a &= ~0x8000000080000000ull;
1441
    b &= ~0x8000000080000000ull;
1442
    return (a + b) ^ mask;
1443
}
1444

    
1445
uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
1446
{
1447
    uint64_t tmp;
1448
    uint64_t tmp2;
1449

    
1450
    tmp = a & 0x0000ffff0000ffffull;
1451
    tmp += (a >> 16) & 0x0000ffff0000ffffull;
1452
    tmp2 = b & 0xffff0000ffff0000ull;
1453
    tmp2 += (b << 16) & 0xffff0000ffff0000ull;
1454
    return    ( tmp         & 0xffff)
1455
            | ((tmp  >> 16) & 0xffff0000ull)
1456
            | ((tmp2 << 16) & 0xffff00000000ull)
1457
            | ( tmp2        & 0xffff000000000000ull);
1458
}
1459

    
1460
uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
1461
{
1462
    uint32_t low = a + (a >> 32);
1463
    uint32_t high = b + (b >> 32);
1464
    return low + ((uint64_t)high << 32);
1465
}
1466

    
1467
uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
1468
{
1469
    uint64_t mask;
1470
    mask = (a ^ ~b) & 0x8000800080008000ull;
1471
    a |= 0x8000800080008000ull;
1472
    b &= ~0x8000800080008000ull;
1473
    return (a - b) ^ mask;
1474
}
1475

    
1476
uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
1477
{
1478
    uint64_t mask;
1479
    mask = (a ^ ~b) & 0x8000000080000000ull;
1480
    a |= 0x8000000080000000ull;
1481
    b &= ~0x8000000080000000ull;
1482
    return (a - b) ^ mask;
1483
}
1484

    
1485
/* Signed saturating add of two packed 32-bit lanes.  A lane whose sum
   overflows is replaced by 0x7fffffff or 0x80000000, chosen by the sign
   of the first operand, and SET_QC() is invoked.  */
uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint32_t a_lo = a;
    uint32_t b_lo = b;
    uint32_t a_hi = a >> 32;
    uint32_t b_hi = b >> 32;
    uint32_t sum_lo = a_lo + b_lo;
    uint32_t sum_hi = a_hi + b_hi;

    /* Overflow: operands share a sign that the sum does not have.  */
    if (!((a_lo ^ b_lo) & SIGNBIT) && ((sum_lo ^ a_lo) & SIGNBIT)) {
        SET_QC();
        sum_lo = ((int32_t)a_lo >> 31) ^ ~SIGNBIT;
    }
    if (!((a_hi ^ b_hi) & SIGNBIT) && ((sum_hi ^ a_hi) & SIGNBIT)) {
        SET_QC();
        sum_hi = ((int32_t)a_hi >> 31) ^ ~SIGNBIT;
    }
    return ((uint64_t)sum_hi << 32) | sum_lo;
}
1506

    
1507
/* Signed saturating 64-bit add.  On overflow the result is replaced by
   the largest or smallest int64_t value, chosen by the sign of a, and
   SET_QC() is invoked.  */
uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t sum = a + b;

    /* Overflow: operands share a sign that the sum does not have.  */
    if (!((a ^ b) & SIGNBIT64) && ((sum ^ a) & SIGNBIT64)) {
        SET_QC();
        sum = ((int64_t)a >> 63) ^ ~SIGNBIT64;
    }
    return sum;
}
1518

    
1519
/* Absolute difference of x and y, each first converted to "intype".
 * The subtraction is carried out in the wider "arithtype" because the
 * difference of two values can exceed the range of the input type (for
 * example with a signed 32 bit op it can overflow a signed 32 bit value).
 */
#define DO_ABD(dest, x, y, intype, arithtype) do {            \
    arithtype abd_lhs = (intype)(x);                          \
    arithtype abd_rhs = (intype)(y);                          \
    if (abd_lhs > abd_rhs) {                                  \
        dest = abd_lhs - abd_rhs;                             \
    } else {                                                  \
        dest = abd_rhs - abd_lhs;                             \
    }                                                         \
    } while(0)
1528

    
1529
uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
1530
{
1531
    uint64_t tmp;
1532
    uint64_t result;
1533
    DO_ABD(result, a, b, uint8_t, uint32_t);
1534
    DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
1535
    result |= tmp << 16;
1536
    DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
1537
    result |= tmp << 32;
1538
    DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
1539
    result |= tmp << 48;
1540
    return result;
1541
}
1542

    
1543
uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
1544
{
1545
    uint64_t tmp;
1546
    uint64_t result;
1547
    DO_ABD(result, a, b, int8_t, int32_t);
1548
    DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
1549
    result |= tmp << 16;
1550
    DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
1551
    result |= tmp << 32;
1552
    DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
1553
    result |= tmp << 48;
1554
    return result;
1555
}
1556

    
1557
uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
1558
{
1559
    uint64_t tmp;
1560
    uint64_t result;
1561
    DO_ABD(result, a, b, uint16_t, uint32_t);
1562
    DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
1563
    return result | (tmp << 32);
1564
}
1565

    
1566
uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
1567
{
1568
    uint64_t tmp;
1569
    uint64_t result;
1570
    DO_ABD(result, a, b, int16_t, int32_t);
1571
    DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
1572
    return result | (tmp << 32);
1573
}
1574

    
1575
uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
1576
{
1577
    uint64_t result;
1578
    DO_ABD(result, a, b, uint32_t, uint64_t);
1579
    return result;
1580
}
1581

    
1582
uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
1583
{
1584
    uint64_t result;
1585
    DO_ABD(result, a, b, int32_t, int64_t);
1586
    return result;
1587
}
1588
#undef DO_ABD
1589

    
1590
/* Widening multiply. Named type is the source type.
   x and y are converted to type1 (the narrow source type), then
   multiplied after conversion to type2 (the unsigned result type); the
   product is truncated to type2 and stored in dest.
   The leading "1u *" forces the multiplication into unsigned arithmetic:
   when type2 is narrower than int its operands would otherwise promote to
   signed int, and products such as 0xff80 * 0xff80 would overflow.  */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)(1u * (type2)tmp_x * (type2)tmp_y); \
    } while(0)
1596

    
1597
uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
1598
{
1599
    uint64_t tmp;
1600
    uint64_t result;
1601

    
1602
    DO_MULL(result, a, b, uint8_t, uint16_t);
1603
    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
1604
    result |= tmp << 16;
1605
    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
1606
    result |= tmp << 32;
1607
    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
1608
    result |= tmp << 48;
1609
    return result;
1610
}
1611

    
1612
uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
1613
{
1614
    uint64_t tmp;
1615
    uint64_t result;
1616

    
1617
    DO_MULL(result, a, b, int8_t, uint16_t);
1618
    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
1619
    result |= tmp << 16;
1620
    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
1621
    result |= tmp << 32;
1622
    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
1623
    result |= tmp << 48;
1624
    return result;
1625
}
1626

    
1627
uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
1628
{
1629
    uint64_t tmp;
1630
    uint64_t result;
1631

    
1632
    DO_MULL(result, a, b, uint16_t, uint32_t);
1633
    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
1634
    return result | (tmp << 32);
1635
}
1636

    
1637
uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
1638
{
1639
    uint64_t tmp;
1640
    uint64_t result;
1641

    
1642
    DO_MULL(result, a, b, int16_t, uint32_t);
1643
    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
1644
    return result | (tmp << 32);
1645
}
1646

    
1647
uint64_t HELPER(neon_negl_u16)(uint64_t x)
1648
{
1649
    uint16_t tmp;
1650
    uint64_t result;
1651
    result = (uint16_t)-x;
1652
    tmp = -(x >> 16);
1653
    result |= (uint64_t)tmp << 16;
1654
    tmp = -(x >> 32);
1655
    result |= (uint64_t)tmp << 32;
1656
    tmp = -(x >> 48);
1657
    result |= (uint64_t)tmp << 48;
1658
    return result;
1659
}
1660

    
1661
uint64_t HELPER(neon_negl_u32)(uint64_t x)
1662
{
1663
    uint32_t low = -x;
1664
    uint32_t high = -(x >> 32);
1665
    return low | ((uint64_t)high << 32);
1666
}
1667

    
1668
/* FIXME:  There should be a native op for this.  */
1669
uint64_t HELPER(neon_negl_u64)(uint64_t x)
1670
{
1671
    return -x;
1672
}
1673

    
1674
/* Saturating sign manipulation.  */
1675
/* ??? Make these use NEON_VOP1 */
1676
#define DO_QABS8(x) do { \
1677
    if (x == (int8_t)0x80) { \
1678
        x = 0x7f; \
1679
        SET_QC(); \
1680
    } else if (x < 0) { \
1681
        x = -x; \
1682
    }} while (0)
1683
uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
1684
{
1685
    neon_s8 vec;
1686
    NEON_UNPACK(neon_s8, vec, x);
1687
    DO_QABS8(vec.v1);
1688
    DO_QABS8(vec.v2);
1689
    DO_QABS8(vec.v3);
1690
    DO_QABS8(vec.v4);
1691
    NEON_PACK(neon_s8, x, vec);
1692
    return x;
1693
}
1694
#undef DO_QABS8
1695

    
1696
#define DO_QNEG8(x) do { \
1697
    if (x == (int8_t)0x80) { \
1698
        x = 0x7f; \
1699
        SET_QC(); \
1700
    } else { \
1701
        x = -x; \
1702
    }} while (0)
1703
uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
1704
{
1705
    neon_s8 vec;
1706
    NEON_UNPACK(neon_s8, vec, x);
1707
    DO_QNEG8(vec.v1);
1708
    DO_QNEG8(vec.v2);
1709
    DO_QNEG8(vec.v3);
1710
    DO_QNEG8(vec.v4);
1711
    NEON_PACK(neon_s8, x, vec);
1712
    return x;
1713
}
1714
#undef DO_QNEG8
1715

    
1716
#define DO_QABS16(x) do { \
1717
    if (x == (int16_t)0x8000) { \
1718
        x = 0x7fff; \
1719
        SET_QC(); \
1720
    } else if (x < 0) { \
1721
        x = -x; \
1722
    }} while (0)
1723
uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
1724
{
1725
    neon_s16 vec;
1726
    NEON_UNPACK(neon_s16, vec, x);
1727
    DO_QABS16(vec.v1);
1728
    DO_QABS16(vec.v2);
1729
    NEON_PACK(neon_s16, x, vec);
1730
    return x;
1731
}
1732
#undef DO_QABS16
1733

    
1734
#define DO_QNEG16(x) do { \
1735
    if (x == (int16_t)0x8000) { \
1736
        x = 0x7fff; \
1737
        SET_QC(); \
1738
    } else { \
1739
        x = -x; \
1740
    }} while (0)
1741
uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
1742
{
1743
    neon_s16 vec;
1744
    NEON_UNPACK(neon_s16, vec, x);
1745
    DO_QNEG16(vec.v1);
1746
    DO_QNEG16(vec.v2);
1747
    NEON_PACK(neon_s16, x, vec);
1748
    return x;
1749
}
1750
#undef DO_QNEG16
1751

    
1752
/* Saturating absolute value of a signed 32-bit value: the most negative
   value becomes 0x7fffffff and invokes SET_QC().  */
uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        return ~SIGNBIT;
    }
    return ((int32_t)x < 0) ? -x : x;
}
1762

    
1763
/* Saturating negation of a signed 32-bit value: the most negative value
   becomes 0x7fffffff and invokes SET_QC().  */
uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
{
    if (x != SIGNBIT) {
        return -x;
    }
    SET_QC();
    return ~SIGNBIT;
}
1773

    
1774
/* NEON Float helpers.  */
1775
uint32_t HELPER(neon_min_f32)(uint32_t a, uint32_t b, void *fpstp)
1776
{
1777
    float_status *fpst = fpstp;
1778
    return float32_val(float32_min(make_float32(a), make_float32(b), fpst));
1779
}
1780

    
1781
uint32_t HELPER(neon_max_f32)(uint32_t a, uint32_t b, void *fpstp)
1782
{
1783
    float_status *fpst = fpstp;
1784
    return float32_val(float32_max(make_float32(a), make_float32(b), fpst));
1785
}
1786

    
1787
uint32_t HELPER(neon_abd_f32)(uint32_t a, uint32_t b, void *fpstp)
1788
{
1789
    float_status *fpst = fpstp;
1790
    float32 f0 = make_float32(a);
1791
    float32 f1 = make_float32(b);
1792
    return float32_val(float32_abs(float32_sub(f0, f1, fpst)));
1793
}
1794

    
1795
/* Floating point comparisons produce an integer result.
1796
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1797
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1798
 */
1799
uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp)
1800
{
1801
    float_status *fpst = fpstp;
1802
    return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
1803
}
1804

    
1805
uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp)
1806
{
1807
    float_status *fpst = fpstp;
1808
    return -float32_le(make_float32(b), make_float32(a), fpst);
1809
}
1810

    
1811
uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp)
1812
{
1813
    float_status *fpst = fpstp;
1814
    return -float32_lt(make_float32(b), make_float32(a), fpst);
1815
}
1816

    
1817
uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp)
1818
{
1819
    float_status *fpst = fpstp;
1820
    float32 f0 = float32_abs(make_float32(a));
1821
    float32 f1 = float32_abs(make_float32(b));
1822
    return -float32_le(f1, f0, fpst);
1823
}
1824

    
1825
uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp)
1826
{
1827
    float_status *fpst = fpstp;
1828
    float32 f0 = float32_abs(make_float32(a));
1829
    float32 f1 = float32_abs(make_float32(b));
1830
    return -float32_lt(f1, f0, fpst);
1831
}
1832

    
1833
/* Extract element N (counting from the least significant end) of width
   SIZE bits from V.  SIZE must be less than 64.  */
#define ELEM(V, N, SIZE) \
    (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))
1834

    
1835
/* De-interleave bytes across the register pairs regs[rd], regs[rd + 1]
   and regs[rm], regs[rm + 1].  Treating rd:rd+1 followed by rm:rm+1 as
   one sequence of bytes, the even-indexed bytes are written back to the
   rd pair and the odd-indexed bytes to the rm pair.  */
void HELPER(neon_qunzip8)(CPUARMState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
    uint64_t d0 = 0, d1 = 0, m0 = 0, m1 = 0;
    int i;

    for (i = 0; i < 4; i++) {
        int lo = 8 * i;
        int hi = 32 + 8 * i;

        d0 |= (ELEM(zd0, 2 * i, 8) << lo) | (ELEM(zd1, 2 * i, 8) << hi);
        d1 |= (ELEM(zm0, 2 * i, 8) << lo) | (ELEM(zm1, 2 * i, 8) << hi);
        m0 |= (ELEM(zd0, 2 * i + 1, 8) << lo)
            | (ELEM(zd1, 2 * i + 1, 8) << hi);
        m1 |= (ELEM(zm0, 2 * i + 1, 8) << lo)
            | (ELEM(zm1, 2 * i + 1, 8) << hi);
    }
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rm + 1] = make_float64(m1);
    env->vfp.regs[rd] = make_float64(d0);
    env->vfp.regs[rd + 1] = make_float64(d1);
}
1862

    
1863
/* De-interleave 16-bit elements across the register pairs regs[rd],
   regs[rd + 1] and regs[rm], regs[rm + 1]: even-indexed elements go to
   the rd pair, odd-indexed elements to the rm pair.  */
void HELPER(neon_qunzip16)(CPUARMState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
    uint64_t d0 = 0, d1 = 0, m0 = 0, m1 = 0;
    int i;

    for (i = 0; i < 2; i++) {
        int lo = 16 * i;
        int hi = 32 + 16 * i;

        d0 |= (ELEM(zd0, 2 * i, 16) << lo) | (ELEM(zd1, 2 * i, 16) << hi);
        d1 |= (ELEM(zm0, 2 * i, 16) << lo) | (ELEM(zm1, 2 * i, 16) << hi);
        m0 |= (ELEM(zd0, 2 * i + 1, 16) << lo)
            | (ELEM(zd1, 2 * i + 1, 16) << hi);
        m1 |= (ELEM(zm0, 2 * i + 1, 16) << lo)
            | (ELEM(zm1, 2 * i + 1, 16) << hi);
    }
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rm + 1] = make_float64(m1);
    env->vfp.regs[rd] = make_float64(d0);
    env->vfp.regs[rd + 1] = make_float64(d1);
}
1882

    
1883
/* De-interleave 32-bit elements across the register pairs regs[rd],
   regs[rd + 1] and regs[rm], regs[rm + 1]: the low words go to the rd
   pair, the high words to the rm pair.  */
void HELPER(neon_qunzip32)(CPUARMState *env, uint32_t rd, uint32_t rm)
{
    const uint64_t low_word = 0xffffffffull;
    const uint64_t high_word = 0xffffffff00000000ull;
    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
    uint64_t d0 = (zd0 & low_word) | (zd1 << 32);
    uint64_t d1 = (zm0 & low_word) | (zm1 << 32);
    uint64_t m0 = (zd0 >> 32) | (zd1 & high_word);
    uint64_t m1 = (zm0 >> 32) | (zm1 & high_word);

    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rm + 1] = make_float64(m1);
    env->vfp.regs[rd] = make_float64(d0);
    env->vfp.regs[rd + 1] = make_float64(d1);
}
1898

    
1899
/* De-interleave bytes of regs[rd] and regs[rm]: the even-indexed bytes of
   both (rd first) are written to regs[rd], the odd-indexed bytes to
   regs[rm].  */
void HELPER(neon_unzip8)(CPUARMState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm = float64_val(env->vfp.regs[rm]);
    uint64_t zd = float64_val(env->vfp.regs[rd]);
    uint64_t d0 = 0, m0 = 0;
    int i;

    for (i = 0; i < 4; i++) {
        int lo = 8 * i;
        int hi = 32 + 8 * i;

        d0 |= (ELEM(zd, 2 * i, 8) << lo) | (ELEM(zm, 2 * i, 8) << hi);
        m0 |= (ELEM(zd, 2 * i + 1, 8) << lo)
            | (ELEM(zm, 2 * i + 1, 8) << hi);
    }
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rd] = make_float64(d0);
}
1914

    
1915
/* De-interleave 16-bit elements of regs[rd] and regs[rm]: the
   even-indexed elements of both (rd first) are written to regs[rd], the
   odd-indexed elements to regs[rm].  */
void HELPER(neon_unzip16)(CPUARMState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm = float64_val(env->vfp.regs[rm]);
    uint64_t zd = float64_val(env->vfp.regs[rd]);
    uint64_t d0 = 0, m0 = 0;
    int i;

    for (i = 0; i < 2; i++) {
        int lo = 16 * i;
        int hi = 32 + 16 * i;

        d0 |= (ELEM(zd, 2 * i, 16) << lo) | (ELEM(zm, 2 * i, 16) << hi);
        m0 |= (ELEM(zd, 2 * i + 1, 16) << lo)
            | (ELEM(zm, 2 * i + 1, 16) << hi);
    }
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rd] = make_float64(d0);
}
1926

    
1927
/* Interleave bytes of the register pairs regs[rd], regs[rd + 1] and
   regs[rm], regs[rm + 1].  Bytes are taken alternately from the rd pair
   and the rm pair; the interleaving of regs[rd] with regs[rm] is written
   to the rd pair and that of regs[rd + 1] with regs[rm + 1] to the rm
   pair.  */
void HELPER(neon_qzip8)(CPUARMState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
    uint64_t d0 = 0, d1 = 0, m0 = 0, m1 = 0;
    int i;

    for (i = 0; i < 4; i++) {
        int pos = 16 * i;

        d0 |= (ELEM(zd0, i, 8) << pos) | (ELEM(zm0, i, 8) << (pos + 8));
        d1 |= (ELEM(zd0, i + 4, 8) << pos)
            | (ELEM(zm0, i + 4, 8) << (pos + 8));
        m0 |= (ELEM(zd1, i, 8) << pos) | (ELEM(zm1, i, 8) << (pos + 8));
        m1 |= (ELEM(zd1, i + 4, 8) << pos)
            | (ELEM(zm1, i + 4, 8) << (pos + 8));
    }
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rm + 1] = make_float64(m1);
    env->vfp.regs[rd] = make_float64(d0);
    env->vfp.regs[rd + 1] = make_float64(d1);
}
1954

    
1955
/* Interleave 16-bit elements of the register pairs regs[rd],
   regs[rd + 1] and regs[rm], regs[rm + 1].  The interleaving of regs[rd]
   with regs[rm] is written to the rd pair and that of regs[rd + 1] with
   regs[rm + 1] to the rm pair.  */
void HELPER(neon_qzip16)(CPUARMState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
    uint64_t d0 = 0, d1 = 0, m0 = 0, m1 = 0;
    int i;

    for (i = 0; i < 2; i++) {
        int pos = 32 * i;

        d0 |= (ELEM(zd0, i, 16) << pos) | (ELEM(zm0, i, 16) << (pos + 16));
        d1 |= (ELEM(zd0, i + 2, 16) << pos)
            | (ELEM(zm0, i + 2, 16) << (pos + 16));
        m0 |= (ELEM(zd1, i, 16) << pos) | (ELEM(zm1, i, 16) << (pos + 16));
        m1 |= (ELEM(zd1, i + 2, 16) << pos)
            | (ELEM(zm1, i + 2, 16) << (pos + 16));
    }
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rm + 1] = make_float64(m1);
    env->vfp.regs[rd] = make_float64(d0);
    env->vfp.regs[rd + 1] = make_float64(d1);
}
1974

    
1975
/* Interleave 32-bit elements of the register pairs regs[rd],
   regs[rd + 1] and regs[rm], regs[rm + 1].  The interleaving of regs[rd]
   with regs[rm] is written to the rd pair and that of regs[rd + 1] with
   regs[rm + 1] to the rm pair.  */
void HELPER(neon_qzip32)(CPUARMState *env, uint32_t rd, uint32_t rm)
{
    const uint64_t low_word = 0xffffffffull;
    const uint64_t high_word = 0xffffffff00000000ull;
    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
    uint64_t d0 = (zd0 & low_word) | (zm0 << 32);
    uint64_t d1 = (zd0 >> 32) | (zm0 & high_word);
    uint64_t m0 = (zd1 & low_word) | (zm1 << 32);
    uint64_t m1 = (zd1 >> 32) | (zm1 & high_word);

    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rm + 1] = make_float64(m1);
    env->vfp.regs[rd] = make_float64(d0);
    env->vfp.regs[rd + 1] = make_float64(d1);
}
1990

    
1991
/* Interleave the bytes of regs[rd] and regs[rm]: bytes 0-3 of each,
   taken alternately (rd first), are written to regs[rd]; bytes 4-7 of
   each, taken the same way, are written to regs[rm].  */
void HELPER(neon_zip8)(CPUARMState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm = float64_val(env->vfp.regs[rm]);
    uint64_t zd = float64_val(env->vfp.regs[rd]);
    uint64_t d0 = 0, m0 = 0;
    int i;

    for (i = 0; i < 4; i++) {
        int pos = 16 * i;

        d0 |= (ELEM(zd, i, 8) << pos) | (ELEM(zm, i, 8) << (pos + 8));
        m0 |= (ELEM(zd, i + 4, 8) << pos)
            | (ELEM(zm, i + 4, 8) << (pos + 8));
    }
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rd] = make_float64(d0);
}
2006

    
2007
/* Interleave the 16-bit elements of regs[rd] and regs[rm]: elements 0-1
   of each, taken alternately (rd first), are written to regs[rd];
   elements 2-3 of each, taken the same way, are written to regs[rm].  */
void HELPER(neon_zip16)(CPUARMState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm = float64_val(env->vfp.regs[rm]);
    uint64_t zd = float64_val(env->vfp.regs[rd]);
    uint64_t d0 = 0, m0 = 0;
    int i;

    for (i = 0; i < 2; i++) {
        int pos = 32 * i;

        d0 |= (ELEM(zd, i, 16) << pos) | (ELEM(zm, i, 16) << (pos + 16));
        m0 |= (ELEM(zd, i + 2, 16) << pos)
            | (ELEM(zm, i + 2, 16) << (pos + 16));
    }
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rd] = make_float64(d0);
}