Statistics
| Branch: | Revision:

root / target-arm / neon_helper.c @ e5ca24cb

History | View | Annotate | Download (39.5 kB)

1
/*
2
 * ARM NEON vector operations.
3
 *
4
 * Copyright (c) 2007, 2008 CodeSourcery.
5
 * Written by Paul Brook
6
 *
7
 * This code is licenced under the GNU GPL v2.
8
 */
9
#include <stdlib.h>
10
#include <stdio.h>
11

    
12
#include "cpu.h"
13
#include "exec-all.h"
14
#include "helpers.h"
15

    
16
#define SIGNBIT (uint32_t)0x80000000
17
#define SIGNBIT64 ((uint64_t)1 << 63)
18

    
19
/* Record cumulative saturation by setting the QC bit in the FPSCR.
   The bit is ORed in so that the remaining FPSCR contents are preserved;
   a plain assignment would wipe every other field of the register.
   Requires a CPUState pointer named "env" to be in scope at the use site.  */
#define SET_QC() (env->vfp.xregs[ARM_VFP_FPSCR] |= CPSR_Q)
20

    
21
static float_status neon_float_status;
22
#define NFS &neon_float_status
23

    
24
/* Helper routines to perform bitwise copies between float and int.  */
25
/* Reinterpret the 32 bits of an integer as a float32 (bit copy, no
   numeric conversion).  */
static inline float32 vfp_itos(uint32_t i)
{
    union {
        float32 s;
        uint32_t i;
    } pun;

    pun.i = i;
    return pun.s;
}
35

    
36
/* Reinterpret the 32 bits of a float32 as an integer (bit copy, no
   numeric conversion).  */
static inline uint32_t vfp_stoi(float32 s)
{
    union {
        float32 s;
        uint32_t i;
    } pun;

    pun.s = s;
    return pun.i;
}
46

    
47
#define NEON_TYPE1(name, type) \
48
typedef struct \
49
{ \
50
    type v1; \
51
} neon_##name;
52
#ifdef HOST_WORDS_BIGENDIAN
53
#define NEON_TYPE2(name, type) \
54
typedef struct \
55
{ \
56
    type v2; \
57
    type v1; \
58
} neon_##name;
59
#define NEON_TYPE4(name, type) \
60
typedef struct \
61
{ \
62
    type v4; \
63
    type v3; \
64
    type v2; \
65
    type v1; \
66
} neon_##name;
67
#else
68
#define NEON_TYPE2(name, type) \
69
typedef struct \
70
{ \
71
    type v1; \
72
    type v2; \
73
} neon_##name;
74
#define NEON_TYPE4(name, type) \
75
typedef struct \
76
{ \
77
    type v1; \
78
    type v2; \
79
    type v3; \
80
    type v4; \
81
} neon_##name;
82
#endif
83

    
84
NEON_TYPE4(s8, int8_t)
85
NEON_TYPE4(u8, uint8_t)
86
NEON_TYPE2(s16, int16_t)
87
NEON_TYPE2(u16, uint16_t)
88
NEON_TYPE1(s32, int32_t)
89
NEON_TYPE1(u32, uint32_t)
90
#undef NEON_TYPE4
91
#undef NEON_TYPE2
92
#undef NEON_TYPE1
93

    
94
/* Copy from a uint32_t to a vector structure type.  */
95
#define NEON_UNPACK(vtype, dest, val) do { \
96
    union { \
97
        vtype v; \
98
        uint32_t i; \
99
    } conv_u; \
100
    conv_u.i = (val); \
101
    dest = conv_u.v; \
102
    } while(0)
103

    
104
/* Copy from a vector structure type to a uint32_t.  */
105
#define NEON_PACK(vtype, dest, val) do { \
106
    union { \
107
        vtype v; \
108
        uint32_t i; \
109
    } conv_u; \
110
    conv_u.v = (val); \
111
    dest = conv_u.i; \
112
    } while(0)
113

    
114
#define NEON_DO1 \
115
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
116
#define NEON_DO2 \
117
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
118
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
119
#define NEON_DO4 \
120
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
121
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
122
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
123
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);
124

    
125
#define NEON_VOP_BODY(vtype, n) \
126
{ \
127
    uint32_t res; \
128
    vtype vsrc1; \
129
    vtype vsrc2; \
130
    vtype vdest; \
131
    NEON_UNPACK(vtype, vsrc1, arg1); \
132
    NEON_UNPACK(vtype, vsrc2, arg2); \
133
    NEON_DO##n; \
134
    NEON_PACK(vtype, res, vdest); \
135
    return res; \
136
}
137

    
138
#define NEON_VOP(name, vtype, n) \
139
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
140
NEON_VOP_BODY(vtype, n)
141

    
142
#define NEON_VOP_ENV(name, vtype, n) \
143
uint32_t HELPER(glue(neon_,name))(CPUState *env, uint32_t arg1, uint32_t arg2) \
144
NEON_VOP_BODY(vtype, n)
145

    
146
/* Pairwise operations.  */
147
/* For 32-bit elements each segment only contains a single element, so
148
   the elementwise and pairwise operations are the same.  */
149
/* Pairwise application of NEON_FN: adjacent elements of vsrc1 produce the
   low half of vdest, adjacent elements of vsrc2 produce the high half.  */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
/* The last line deliberately has no trailing backslash, so that the line
   following the macro is not spliced into its definition.  */
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4);
157

    
158
#define NEON_POP(name, vtype, n) \
159
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
160
{ \
161
    uint32_t res; \
162
    vtype vsrc1; \
163
    vtype vsrc2; \
164
    vtype vdest; \
165
    NEON_UNPACK(vtype, vsrc1, arg1); \
166
    NEON_UNPACK(vtype, vsrc2, arg2); \
167
    NEON_PDO##n; \
168
    NEON_PACK(vtype, res, vdest); \
169
    return res; \
170
}
171

    
172
/* Unary operators.  */
173
#define NEON_VOP1(name, vtype, n) \
174
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
175
{ \
176
    vtype vsrc1; \
177
    vtype vdest; \
178
    NEON_UNPACK(vtype, vsrc1, arg); \
179
    NEON_DO##n; \
180
    NEON_PACK(vtype, arg, vdest); \
181
    return arg; \
182
}
183

    
184

    
185
#define NEON_USAT(dest, src1, src2, type) do { \
186
    uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
187
    if (tmp != (type)tmp) { \
188
        SET_QC(); \
189
        dest = ~0; \
190
    } else { \
191
        dest = tmp; \
192
    }} while(0)
193
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
194
NEON_VOP_ENV(qadd_u8, neon_u8, 4)
195
#undef NEON_FN
196
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
197
NEON_VOP_ENV(qadd_u16, neon_u16, 2)
198
#undef NEON_FN
199
#undef NEON_USAT
200

    
201
/* Unsigned saturating 32-bit add; sets QC and returns all-ones on overflow.  */
uint32_t HELPER(neon_qadd_u32)(CPUState *env, uint32_t a, uint32_t b)
{
    uint32_t sum = a + b;

    /* A wrapped sum is smaller than either operand.  */
    if (sum >= a) {
        return sum;
    }
    SET_QC();
    return 0xffffffffu;
}
210

    
211
/* Unsigned saturating 64-bit add; sets QC and returns all-ones on overflow.  */
uint64_t HELPER(neon_qadd_u64)(CPUState *env, uint64_t src1, uint64_t src2)
{
    uint64_t sum = src1 + src2;

    /* A wrapped sum is smaller than either operand.  */
    if (sum >= src1) {
        return sum;
    }
    SET_QC();
    return ~(uint64_t)0;
}
222

    
223
#define NEON_SSAT(dest, src1, src2, type) do { \
224
    int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
225
    if (tmp != (type)tmp) { \
226
        SET_QC(); \
227
        if (src2 > 0) { \
228
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
229
        } else { \
230
            tmp = 1 << (sizeof(type) * 8 - 1); \
231
        } \
232
    } \
233
    dest = tmp; \
234
    } while(0)
235
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
236
NEON_VOP_ENV(qadd_s8, neon_s8, 4)
237
#undef NEON_FN
238
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
239
NEON_VOP_ENV(qadd_s16, neon_s16, 2)
240
#undef NEON_FN
241
#undef NEON_SSAT
242

    
243
/* Signed saturating 32-bit add on values passed as raw 32-bit patterns.
   On overflow QC is set and the result is clamped towards the sign of a.  */
uint32_t HELPER(neon_qadd_s32)(CPUState *env, uint32_t a, uint32_t b)
{
    uint32_t sum = a + b;

    /* Overflow: operands share a sign but the sum's sign differs from a.  */
    if (~(a ^ b) & (a ^ sum) & SIGNBIT) {
        SET_QC();
        return ((int32_t)a < 0) ? SIGNBIT : ~SIGNBIT;
    }
    return sum;
}
252

    
253
/* Signed saturating 64-bit add on values passed as raw 64-bit patterns.
   On overflow QC is set and the result is clamped towards the sign of src1.  */
uint64_t HELPER(neon_qadd_s64)(CPUState *env, uint64_t src1, uint64_t src2)
{
    uint64_t sum = src1 + src2;

    /* Overflow: operands share a sign but the sum's sign differs from src1.  */
    if (~(src1 ^ src2) & (src1 ^ sum) & SIGNBIT64) {
        SET_QC();
        return ((int64_t)src1 < 0) ? SIGNBIT64 : ~SIGNBIT64;
    }
    return sum;
}
264

    
265
#define NEON_USAT(dest, src1, src2, type) do { \
266
    uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
267
    if (tmp != (type)tmp) { \
268
        SET_QC(); \
269
        dest = 0; \
270
    } else { \
271
        dest = tmp; \
272
    }} while(0)
273
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
274
NEON_VOP_ENV(qsub_u8, neon_u8, 4)
275
#undef NEON_FN
276
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
277
NEON_VOP_ENV(qsub_u16, neon_u16, 2)
278
#undef NEON_FN
279
#undef NEON_USAT
280

    
281
/* Unsigned saturating 32-bit subtract; sets QC and returns 0 on underflow.  */
uint32_t HELPER(neon_qsub_u32)(CPUState *env, uint32_t a, uint32_t b)
{
    uint32_t diff = a - b;

    /* A wrapped difference is larger than the minuend.  */
    if (diff <= a) {
        return diff;
    }
    SET_QC();
    return 0;
}
290

    
291
/* Unsigned saturating 64-bit subtract; sets QC and returns 0 on underflow.  */
uint64_t HELPER(neon_qsub_u64)(CPUState *env, uint64_t src1, uint64_t src2)
{
    if (src1 >= src2) {
        return src1 - src2;
    }
    SET_QC();
    return 0;
}
303

    
304
#define NEON_SSAT(dest, src1, src2, type) do { \
305
    int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
306
    if (tmp != (type)tmp) { \
307
        SET_QC(); \
308
        if (src2 < 0) { \
309
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
310
        } else { \
311
            tmp = 1 << (sizeof(type) * 8 - 1); \
312
        } \
313
    } \
314
    dest = tmp; \
315
    } while(0)
316
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
317
NEON_VOP_ENV(qsub_s8, neon_s8, 4)
318
#undef NEON_FN
319
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
320
NEON_VOP_ENV(qsub_s16, neon_s16, 2)
321
#undef NEON_FN
322
#undef NEON_SSAT
323

    
324
/* Signed saturating 32-bit subtract on values passed as raw 32-bit patterns.
   On overflow QC is set and the result is clamped towards the sign of a.  */
uint32_t HELPER(neon_qsub_s32)(CPUState *env, uint32_t a, uint32_t b)
{
    uint32_t diff = a - b;

    /* Overflow: operands differ in sign and the result's sign differs
       from a.  */
    if ((a ^ b) & (a ^ diff) & SIGNBIT) {
        SET_QC();
        return ((int32_t)a < 0) ? SIGNBIT : ~SIGNBIT;
    }
    return diff;
}
333

    
334
/* Signed saturating 64-bit subtract on values passed as raw 64-bit patterns.
   On overflow QC is set and the result is clamped towards the sign of src1.  */
uint64_t HELPER(neon_qsub_s64)(CPUState *env, uint64_t src1, uint64_t src2)
{
    uint64_t diff = src1 - src2;

    /* Overflow: operands differ in sign and the result's sign differs
       from src1.  */
    if ((src1 ^ src2) & (src1 ^ diff) & SIGNBIT64) {
        SET_QC();
        return ((int64_t)src1 < 0) ? SIGNBIT64 : ~SIGNBIT64;
    }
    return diff;
}
345

    
346
#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
347
NEON_VOP(hadd_s8, neon_s8, 4)
348
NEON_VOP(hadd_u8, neon_u8, 4)
349
NEON_VOP(hadd_s16, neon_s16, 2)
350
NEON_VOP(hadd_u16, neon_u16, 2)
351
#undef NEON_FN
352

    
353
int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
354
{
355
    int32_t dest;
356

    
357
    dest = (src1 >> 1) + (src2 >> 1);
358
    if (src1 & src2 & 1)
359
        dest++;
360
    return dest;
361
}
362

    
363
uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
364
{
365
    uint32_t dest;
366

    
367
    dest = (src1 >> 1) + (src2 >> 1);
368
    if (src1 & src2 & 1)
369
        dest++;
370
    return dest;
371
}
372

    
373
#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
374
NEON_VOP(rhadd_s8, neon_s8, 4)
375
NEON_VOP(rhadd_u8, neon_u8, 4)
376
NEON_VOP(rhadd_s16, neon_s16, 2)
377
NEON_VOP(rhadd_u16, neon_u16, 2)
378
#undef NEON_FN
379

    
380
int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
381
{
382
    int32_t dest;
383

    
384
    dest = (src1 >> 1) + (src2 >> 1);
385
    if ((src1 | src2) & 1)
386
        dest++;
387
    return dest;
388
}
389

    
390
uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
391
{
392
    uint32_t dest;
393

    
394
    dest = (src1 >> 1) + (src2 >> 1);
395
    if ((src1 | src2) & 1)
396
        dest++;
397
    return dest;
398
}
399

    
400
#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
401
NEON_VOP(hsub_s8, neon_s8, 4)
402
NEON_VOP(hsub_u8, neon_u8, 4)
403
NEON_VOP(hsub_s16, neon_s16, 2)
404
NEON_VOP(hsub_u16, neon_u16, 2)
405
#undef NEON_FN
406

    
407
int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
408
{
409
    int32_t dest;
410

    
411
    dest = (src1 >> 1) - (src2 >> 1);
412
    if ((~src1) & src2 & 1)
413
        dest--;
414
    return dest;
415
}
416

    
417
uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
418
{
419
    uint32_t dest;
420

    
421
    dest = (src1 >> 1) - (src2 >> 1);
422
    if ((~src1) & src2 & 1)
423
        dest--;
424
    return dest;
425
}
426

    
427
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? ~0 : 0
428
NEON_VOP(cgt_s8, neon_s8, 4)
429
NEON_VOP(cgt_u8, neon_u8, 4)
430
NEON_VOP(cgt_s16, neon_s16, 2)
431
NEON_VOP(cgt_u16, neon_u16, 2)
432
NEON_VOP(cgt_s32, neon_s32, 1)
433
NEON_VOP(cgt_u32, neon_u32, 1)
434
#undef NEON_FN
435

    
436
#define NEON_FN(dest, src1, src2) dest = (src1 >= src2) ? ~0 : 0
437
NEON_VOP(cge_s8, neon_s8, 4)
438
NEON_VOP(cge_u8, neon_u8, 4)
439
NEON_VOP(cge_s16, neon_s16, 2)
440
NEON_VOP(cge_u16, neon_u16, 2)
441
NEON_VOP(cge_s32, neon_s32, 1)
442
NEON_VOP(cge_u32, neon_u32, 1)
443
#undef NEON_FN
444

    
445
#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
446
NEON_VOP(min_s8, neon_s8, 4)
447
NEON_VOP(min_u8, neon_u8, 4)
448
NEON_VOP(min_s16, neon_s16, 2)
449
NEON_VOP(min_u16, neon_u16, 2)
450
NEON_VOP(min_s32, neon_s32, 1)
451
NEON_VOP(min_u32, neon_u32, 1)
452
NEON_POP(pmin_s8, neon_s8, 4)
453
NEON_POP(pmin_u8, neon_u8, 4)
454
NEON_POP(pmin_s16, neon_s16, 2)
455
NEON_POP(pmin_u16, neon_u16, 2)
456
#undef NEON_FN
457

    
458
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
459
NEON_VOP(max_s8, neon_s8, 4)
460
NEON_VOP(max_u8, neon_u8, 4)
461
NEON_VOP(max_s16, neon_s16, 2)
462
NEON_VOP(max_u16, neon_u16, 2)
463
NEON_VOP(max_s32, neon_s32, 1)
464
NEON_VOP(max_u32, neon_u32, 1)
465
NEON_POP(pmax_s8, neon_s8, 4)
466
NEON_POP(pmax_u8, neon_u8, 4)
467
NEON_POP(pmax_s16, neon_s16, 2)
468
NEON_POP(pmax_u16, neon_u16, 2)
469
#undef NEON_FN
470

    
471
#define NEON_FN(dest, src1, src2) \
472
    dest = (src1 > src2) ? (src1 - src2) : (src2 - src1)
473
NEON_VOP(abd_s8, neon_s8, 4)
474
NEON_VOP(abd_u8, neon_u8, 4)
475
NEON_VOP(abd_s16, neon_s16, 2)
476
NEON_VOP(abd_u16, neon_u16, 2)
477
NEON_VOP(abd_s32, neon_s32, 1)
478
NEON_VOP(abd_u32, neon_u32, 1)
479
#undef NEON_FN
480

    
481
#define NEON_FN(dest, src1, src2) do { \
482
    int8_t tmp; \
483
    tmp = (int8_t)src2; \
484
    if (tmp >= (ssize_t)sizeof(src1) * 8 || \
485
        tmp <= -(ssize_t)sizeof(src1) * 8) { \
486
        dest = 0; \
487
    } else if (tmp < 0) { \
488
        dest = src1 >> -tmp; \
489
    } else { \
490
        dest = src1 << tmp; \
491
    }} while (0)
492
NEON_VOP(shl_u8, neon_u8, 4)
493
NEON_VOP(shl_u16, neon_u16, 2)
494
NEON_VOP(shl_u32, neon_u32, 1)
495
#undef NEON_FN
496

    
497
uint64_t HELPER(neon_shl_u64)(uint64_t val, uint64_t shiftop)
498
{
499
    int8_t shift = (int8_t)shiftop;
500
    if (shift >= 64 || shift <= -64) {
501
        val = 0;
502
    } else if (shift < 0) {
503
        val >>= -shift;
504
    } else {
505
        val <<= shift;
506
    }
507
    return val;
508
}
509

    
510
#define NEON_FN(dest, src1, src2) do { \
511
    int8_t tmp; \
512
    tmp = (int8_t)src2; \
513
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
514
        dest = 0; \
515
    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
516
        dest = src1 >> (sizeof(src1) * 8 - 1); \
517
    } else if (tmp < 0) { \
518
        dest = src1 >> -tmp; \
519
    } else { \
520
        dest = src1 << tmp; \
521
    }} while (0)
522
NEON_VOP(shl_s8, neon_s8, 4)
523
NEON_VOP(shl_s16, neon_s16, 2)
524
NEON_VOP(shl_s32, neon_s32, 1)
525
#undef NEON_FN
526

    
527
uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t shiftop)
528
{
529
    int8_t shift = (int8_t)shiftop;
530
    int64_t val = valop;
531
    if (shift >= 64) {
532
        val = 0;
533
    } else if (shift <= -64) {
534
        val >>= 63;
535
    } else if (shift < 0) {
536
        val >>= -shift;
537
    } else {
538
        val <<= shift;
539
    }
540
    return val;
541
}
542

    
543
#define NEON_FN(dest, src1, src2) do { \
544
    int8_t tmp; \
545
    tmp = (int8_t)src2; \
546
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
547
        dest = 0; \
548
    } else if (tmp < -(ssize_t)sizeof(src1) * 8) { \
549
        dest = src1 >> (sizeof(src1) * 8 - 1); \
550
    } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
551
        dest = src1 >> (tmp - 1); \
552
        dest++; \
553
        dest >>= 1; \
554
    } else if (tmp < 0) { \
555
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
556
    } else { \
557
        dest = src1 << tmp; \
558
    }} while (0)
559
NEON_VOP(rshl_s8, neon_s8, 4)
560
NEON_VOP(rshl_s16, neon_s16, 2)
561
NEON_VOP(rshl_s32, neon_s32, 1)
562
#undef NEON_FN
563

    
564
uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop)
565
{
566
    int8_t shift = (int8_t)shiftop;
567
    int64_t val = valop;
568
    if (shift >= 64) {
569
        val = 0;
570
    } else if (shift < -64) {
571
        val >>= 63;
572
    } else if (shift == -63) {
573
        val >>= 63;
574
        val++;
575
        val >>= 1;
576
    } else if (shift < 0) {
577
        val = (val + ((int64_t)1 << (-1 - shift))) >> -shift;
578
    } else {
579
        val <<= shift;
580
    }
581
    return val;
582
}
583

    
584
#define NEON_FN(dest, src1, src2) do { \
585
    int8_t tmp; \
586
    tmp = (int8_t)src2; \
587
    if (tmp >= (ssize_t)sizeof(src1) * 8 || \
588
        tmp < -(ssize_t)sizeof(src1) * 8) { \
589
        dest = 0; \
590
    } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
591
        dest = src1 >> (tmp - 1); \
592
    } else if (tmp < 0) { \
593
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
594
    } else { \
595
        dest = src1 << tmp; \
596
    }} while (0)
597
NEON_VOP(rshl_u8, neon_u8, 4)
598
NEON_VOP(rshl_u16, neon_u16, 2)
599
NEON_VOP(rshl_u32, neon_u32, 1)
600
#undef NEON_FN
601

    
602
uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop)
603
{
604
    int8_t shift = (uint8_t)shiftop;
605
    if (shift >= 64 || shift < 64) {
606
        val = 0;
607
    } else if (shift == -64) {
608
        /* Rounding a 1-bit result just preserves that bit.  */
609
        val >>= 63;
610
    } if (shift < 0) {
611
        val = (val + ((uint64_t)1 << (-1 - shift))) >> -shift;
612
        val >>= -shift;
613
    } else {
614
        val <<= shift;
615
    }
616
    return val;
617
}
618

    
619
#define NEON_FN(dest, src1, src2) do { \
620
    int8_t tmp; \
621
    tmp = (int8_t)src2; \
622
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
623
        if (src1) { \
624
            SET_QC(); \
625
            dest = ~0; \
626
        } else { \
627
            dest = 0; \
628
        } \
629
    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
630
        dest = 0; \
631
    } else if (tmp < 0) { \
632
        dest = src1 >> -tmp; \
633
    } else { \
634
        dest = src1 << tmp; \
635
        if ((dest >> tmp) != src1) { \
636
            SET_QC(); \
637
            dest = ~0; \
638
        } \
639
    }} while (0)
640
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
641
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
642
NEON_VOP_ENV(qshl_u32, neon_u32, 1)
643
#undef NEON_FN
644

    
645
/* Unsigned saturating 64-bit shift by the signed low byte of shiftop.
   Left shifts that lose set bits set QC and return all-ones; negative
   counts are plain right shifts.  */
uint64_t HELPER(neon_qshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop)
{
    int8_t count = (int8_t)shiftop;
    uint64_t shifted;

    if (count >= 64) {
        /* Anything non-zero is shifted out entirely.  */
        if (val == 0) {
            return 0;
        }
        SET_QC();
        return ~(uint64_t)0;
    }
    if (count <= -64) {
        return 0;
    }
    if (count < 0) {
        return val >> -count;
    }

    shifted = val << count;
    if ((shifted >> count) == val) {
        return shifted;
    }
    SET_QC();
    return ~(uint64_t)0;
}
667

    
668
#define NEON_FN(dest, src1, src2) do { \
669
    int8_t tmp; \
670
    tmp = (int8_t)src2; \
671
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
672
        if (src1) { \
673
            SET_QC(); \
674
            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
675
            if (src1 > 0) { \
676
                dest--; \
677
            } \
678
        } else { \
679
            dest = src1; \
680
        } \
681
    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
682
        dest = src1 >> 31; \
683
    } else if (tmp < 0) { \
684
        dest = src1 >> -tmp; \
685
    } else { \
686
        dest = src1 << tmp; \
687
        if ((dest >> tmp) != src1) { \
688
            SET_QC(); \
689
            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
690
            if (src1 > 0) { \
691
                dest--; \
692
            } \
693
        } \
694
    }} while (0)
695
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
696
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
697
NEON_VOP_ENV(qshl_s32, neon_s32, 1)
698
#undef NEON_FN
699

    
700
/* Signed saturating 64-bit shift by the signed low byte of shiftop.
   Left shifts that change the value when shifted back set QC and clamp to
   the most positive or most negative value, according to the input sign;
   negative counts are arithmetic right shifts.  */
uint64_t HELPER(neon_qshl_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
{
    int8_t count = (uint8_t)shiftop;
    int64_t val = valop;
    int64_t shifted;

    if (count >= 64) {
        if (val == 0) {
            return 0;
        }
        SET_QC();
        return (val < 0) ? SIGNBIT64 : ~SIGNBIT64;
    }
    if (count <= -64) {
        return val >> 63;
    }
    if (count < 0) {
        return val >> -count;
    }

    shifted = val << count;
    if ((shifted >> count) == val) {
        return shifted;
    }
    SET_QC();
    return (val < 0) ? SIGNBIT64 : ~SIGNBIT64;
}
723

    
724
#define NEON_FN(dest, src1, src2) do { \
725
    if (src1 & (1 << (sizeof(src1) * 8 - 1))) { \
726
        SET_QC(); \
727
        dest = 0; \
728
    } else { \
729
        int8_t tmp; \
730
        tmp = (int8_t)src2; \
731
        if (tmp >= (ssize_t)sizeof(src1) * 8) { \
732
            if (src1) { \
733
                SET_QC(); \
734
                dest = ~0; \
735
            } else { \
736
                dest = 0; \
737
            } \
738
        } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
739
            dest = 0; \
740
        } else if (tmp < 0) { \
741
            dest = src1 >> -tmp; \
742
        } else { \
743
            dest = src1 << tmp; \
744
            if ((dest >> tmp) != src1) { \
745
                SET_QC(); \
746
                dest = ~0; \
747
            } \
748
        } \
749
    }} while (0)
750
NEON_VOP_ENV(qshlu_s8, neon_u8, 4)
751
NEON_VOP_ENV(qshlu_s16, neon_u16, 2)
752
#undef NEON_FN
753

    
754
/* Saturating shift of a signed 32-bit input to an unsigned result:
   negative inputs set QC and give 0, everything else is handled by the
   unsigned saturating shift.  */
uint32_t HELPER(neon_qshlu_s32)(CPUState *env, uint32_t valop, uint32_t shiftop)
{
    if ((int32_t)valop >= 0) {
        return helper_neon_qshl_u32(env, valop, shiftop);
    }
    SET_QC();
    return 0;
}
762

    
763
/* Saturating shift of a signed 64-bit input to an unsigned result:
   negative inputs set QC and give 0, everything else is handled by the
   unsigned saturating shift.  */
uint64_t HELPER(neon_qshlu_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
{
    if ((int64_t)valop >= 0) {
        return helper_neon_qshl_u64(env, valop, shiftop);
    }
    SET_QC();
    return 0;
}
771

    
772
/* FIXME: This is wrong.  */
773
#define NEON_FN(dest, src1, src2) do { \
774
    int8_t tmp; \
775
    tmp = (int8_t)src2; \
776
    if (tmp < 0) { \
777
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
778
    } else { \
779
        dest = src1 << tmp; \
780
        if ((dest >> tmp) != src1) { \
781
            SET_QC(); \
782
            dest = ~0; \
783
        } \
784
    }} while (0)
785
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
786
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
787
NEON_VOP_ENV(qrshl_u32, neon_u32, 1)
788
#undef NEON_FN
789

    
790
/* Unsigned saturating rounding 64-bit shift.  The shift count is the signed
   low byte of shiftop: positive counts shift left and saturate to all-ones
   (setting QC) when set bits are lost; negative counts shift right with
   rounding.  */
uint64_t HELPER(neon_qrshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop)
{
    int8_t shift = (int8_t)shiftop;

    if (shift >= 64) {
        /* Any non-zero value is shifted out entirely.  */
        if (val) {
            SET_QC();
            val = ~(uint64_t)0;
        }
    } else if (shift < -64) {
        val = 0;
    } else if (shift == -64) {
        /* Only the rounding carry, i.e. the top bit, survives.  */
        val >>= 63;
    } else if (shift < 0) {
        /* Shift all but the last position, then round using the final
           bit, so the rounding add cannot wrap around 64 bits.  */
        val >>= (-shift - 1);
        if (val == ~(uint64_t)0) {
            /* Only reachable for a shift of -1: val + 1 would wrap.  */
            val = 0x8000000000000000ULL;
        } else {
            val++;
            val >>= 1;
        }
    } else {
        uint64_t tmp = val;
        val <<= shift;
        if ((val >> shift) != tmp) {
            SET_QC();
            val = ~(uint64_t)0;
        }
    }
    return val;
}
805

    
806
#define NEON_FN(dest, src1, src2) do { \
807
    int8_t tmp; \
808
    tmp = (int8_t)src2; \
809
    if (tmp < 0) { \
810
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
811
    } else { \
812
        dest = src1 << tmp; \
813
        if ((dest >> tmp) != src1) { \
814
            SET_QC(); \
815
            dest = src1 >> 31; \
816
        } \
817
    }} while (0)
818
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
819
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
820
NEON_VOP_ENV(qrshl_s32, neon_s32, 1)
821
#undef NEON_FN
822

    
823
/* Signed saturating rounding 64-bit shift.  The shift count is the signed
   low byte of shiftop: positive counts shift left, negative counts shift
   right with rounding.  On left-shift overflow QC is set and the result is
   clamped to the most positive or most negative 64-bit value, according to
   the sign of the input.  */
uint64_t HELPER(neon_qrshl_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
{
    int8_t shift = (int8_t)shiftop;
    int64_t val = valop;

    if (shift >= 64) {
        /* Any non-zero value overflows.  */
        if (val) {
            SET_QC();
            val = (val >> 63) ^ ~SIGNBIT64;
        }
    } else if (shift <= -64) {
        /* A rounded right shift by >= 64 is always 0 for a signed value.  */
        val = 0;
    } else if (shift < 0) {
        /* Shift all but the last position, then round using the final
           bit, so the rounding add cannot overflow.  */
        val >>= (-shift - 1);
        if (val == 0x7fffffffffffffffLL) {
            /* Only reachable for a shift of -1: val + 1 would overflow.  */
            val = 0x4000000000000000LL;
        } else {
            val++;
            val >>= 1;
        }
    } else {
        int64_t tmp = val;
        val <<= shift;
        if ((val >> shift) != tmp) {
            SET_QC();
            val = (tmp >> 63) ^ ~SIGNBIT64;
        }
    }
    return val;
}
840

    
841
uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
842
{
843
    uint32_t mask;
844
    mask = (a ^ b) & 0x80808080u;
845
    a &= ~0x80808080u;
846
    b &= ~0x80808080u;
847
    return (a + b) ^ mask;
848
}
849

    
850
uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
851
{
852
    uint32_t mask;
853
    mask = (a ^ b) & 0x80008000u;
854
    a &= ~0x80008000u;
855
    b &= ~0x80008000u;
856
    return (a + b) ^ mask;
857
}
858

    
859
#define NEON_FN(dest, src1, src2) dest = src1 + src2
860
NEON_POP(padd_u8, neon_u8, 4)
861
NEON_POP(padd_u16, neon_u16, 2)
862
#undef NEON_FN
863

    
864
#define NEON_FN(dest, src1, src2) dest = src1 - src2
865
NEON_VOP(sub_u8, neon_u8, 4)
866
NEON_VOP(sub_u16, neon_u16, 2)
867
#undef NEON_FN
868

    
869
#define NEON_FN(dest, src1, src2) dest = src1 * src2
870
NEON_VOP(mul_u8, neon_u8, 4)
871
NEON_VOP(mul_u16, neon_u16, 2)
872
#undef NEON_FN
873

    
874
/* Polynomial multiplication is like integer multiplication except the
875
   partial products are XORed, not added.  */
876
uint32_t HELPER(neon_mul_p8)(uint32_t op1, uint32_t op2)
877
{
878
    uint32_t mask;
879
    uint32_t result;
880
    result = 0;
881
    while (op1) {
882
        mask = 0;
883
        if (op1 & 1)
884
            mask |= 0xff;
885
        if (op1 & (1 << 8))
886
            mask |= (0xff << 8);
887
        if (op1 & (1 << 16))
888
            mask |= (0xff << 16);
889
        if (op1 & (1 << 24))
890
            mask |= (0xff << 24);
891
        result ^= op2 & mask;
892
        op1 = (op1 >> 1) & 0x7f7f7f7f;
893
        op2 = (op2 << 1) & 0xfefefefe;
894
    }
895
    return result;
896
}
897

    
898
uint64_t HELPER(neon_mull_p8)(uint32_t op1, uint32_t op2)
899
{
900
    uint64_t result = 0;
901
    uint64_t mask;
902
    uint64_t op2ex = op2;
903
    op2ex = (op2ex & 0xff) |
904
        ((op2ex & 0xff00) << 8) |
905
        ((op2ex & 0xff0000) << 16) |
906
        ((op2ex & 0xff000000) << 24);
907
    while (op1) {
908
        mask = 0;
909
        if (op1 & 1) {
910
            mask |= 0xffff;
911
        }
912
        if (op1 & (1 << 8)) {
913
            mask |= (0xffffU << 16);
914
        }
915
        if (op1 & (1 << 16)) {
916
            mask |= (0xffffULL << 32);
917
        }
918
        if (op1 & (1 << 24)) {
919
            mask |= (0xffffULL << 48);
920
        }
921
        result ^= op2ex & mask;
922
        op1 = (op1 >> 1) & 0x7f7f7f7f;
923
        op2ex <<= 1;
924
    }
925
    return result;
926
}
927

    
928
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
929
NEON_VOP(tst_u8, neon_u8, 4)
930
NEON_VOP(tst_u16, neon_u16, 2)
931
NEON_VOP(tst_u32, neon_u32, 1)
932
#undef NEON_FN
933

    
934
#define NEON_FN(dest, src1, src2) dest = (src1 == src2) ? -1 : 0
935
NEON_VOP(ceq_u8, neon_u8, 4)
936
NEON_VOP(ceq_u16, neon_u16, 2)
937
NEON_VOP(ceq_u32, neon_u32, 1)
938
#undef NEON_FN
939

    
940
#define NEON_FN(dest, src, dummy) dest = (src < 0) ? -src : src
941
NEON_VOP1(abs_s8, neon_s8, 4)
942
NEON_VOP1(abs_s16, neon_s16, 2)
943
#undef NEON_FN
944

    
945
/* Count Leading Sign/Zero Bits.  */
946
/* Number of leading zero bits in an 8-bit value (8 when x is 0).  */
static inline int do_clz8(uint8_t x)
{
    int zeros = 8;

    /* Each significant bit shifted out is one fewer leading zero.  */
    while (x != 0) {
        x >>= 1;
        zeros--;
    }
    return zeros;
}
953

    
954
/* Number of leading zero bits in a 16-bit value (16 when x is 0).  */
static inline int do_clz16(uint16_t x)
{
    int zeros = 16;

    /* Each significant bit shifted out is one fewer leading zero.  */
    while (x != 0) {
        x >>= 1;
        zeros--;
    }
    return zeros;
}
961

    
962
#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
963
NEON_VOP1(clz_u8, neon_u8, 4)
964
#undef NEON_FN
965

    
966
#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
967
NEON_VOP1(clz_u16, neon_u16, 2)
968
#undef NEON_FN
969

    
970
#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
971
NEON_VOP1(cls_s8, neon_s8, 4)
972
#undef NEON_FN
973

    
974
#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
975
NEON_VOP1(cls_s16, neon_s16, 2)
976
#undef NEON_FN
977

    
978
uint32_t HELPER(neon_cls_s32)(uint32_t x)
979
{
980
    int count;
981
    if ((int32_t)x < 0)
982
        x = ~x;
983
    for (count = 32; x; count--)
984
        x = x >> 1;
985
    return count - 1;
986
}
987

    
988
/* Bit count.  */
989
uint32_t HELPER(neon_cnt_u8)(uint32_t x)
990
{
991
    x = (x & 0x55555555) + ((x >>  1) & 0x55555555);
992
    x = (x & 0x33333333) + ((x >>  2) & 0x33333333);
993
    x = (x & 0x0f0f0f0f) + ((x >>  4) & 0x0f0f0f0f);
994
    return x;
995
}
996

    
997
#define NEON_QDMULH16(dest, src1, src2, round) do { \
998
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
999
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
1000
        SET_QC(); \
1001
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
1002
    } else { \
1003
        tmp <<= 1; \
1004
    } \
1005
    if (round) { \
1006
        int32_t old = tmp; \
1007
        tmp += 1 << 15; \
1008
        if ((int32_t)tmp < old) { \
1009
            SET_QC(); \
1010
            tmp = SIGNBIT - 1; \
1011
        } \
1012
    } \
1013
    dest = tmp >> 16; \
1014
    } while(0)
1015
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
1016
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
1017
#undef NEON_FN
1018
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
1019
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
1020
#undef NEON_FN
1021
#undef NEON_QDMULH16
1022

    
1023
#define NEON_QDMULH32(dest, src1, src2, round) do { \
1024
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
1025
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
1026
        SET_QC(); \
1027
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
1028
    } else { \
1029
        tmp <<= 1; \
1030
    } \
1031
    if (round) { \
1032
        int64_t old = tmp; \
1033
        tmp += (int64_t)1 << 31; \
1034
        if ((int64_t)tmp < old) { \
1035
            SET_QC(); \
1036
            tmp = SIGNBIT64 - 1; \
1037
        } \
1038
    } \
1039
    dest = tmp >> 32; \
1040
    } while(0)
1041
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
1042
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
1043
#undef NEON_FN
1044
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
1045
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
1046
#undef NEON_FN
1047
#undef NEON_QDMULH32
1048

    
1049
uint32_t HELPER(neon_narrow_u8)(uint64_t x)
1050
{
1051
    return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
1052
           | ((x >> 24) & 0xff000000u);
1053
}
1054

    
1055
uint32_t HELPER(neon_narrow_u16)(uint64_t x)
1056
{
1057
    return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
1058
}
1059

    
1060
uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
1061
{
1062
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
1063
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
1064
}
1065

    
1066
uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
1067
{
1068
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
1069
}
1070

    
1071
uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
1072
{
1073
    x &= 0xff80ff80ff80ff80ull;
1074
    x += 0x0080008000800080ull;
1075
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
1076
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
1077
}
1078

    
1079
uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
1080
{
1081
    x &= 0xffff8000ffff8000ull;
1082
    x += 0x0000800000008000ull;
1083
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
1084
}
1085

    
1086
/* Narrow four signed 16-bit lanes of x to unsigned bytes with saturation.
   Negative lanes become 0 and lanes above 0xff become 0xff; either case
   sets QC.  */
uint32_t HELPER(neon_unarrow_sat8)(CPUState *env, uint64_t x)
{
    uint32_t res = 0;
    int lane;

    for (lane = 0; lane < 4; lane++) {
        uint16_t s = x >> (16 * lane);
        uint8_t d;

        if (s & 0x8000) {
            /* Negative input: the result byte stays 0.  */
            SET_QC();
            continue;
        }
        if (s > 0xff) {
            d = 0xff;
            SET_QC();
        } else {
            d = s;
        }
        res |= (uint32_t)d << (8 * lane);
    }
    return res;
}
1112

    
1113
/* Narrow four unsigned 16-bit lanes to unsigned 8-bit values, clamping
   anything above 0xff to 0xff and invoking SET_QC() when that happens.  */
uint32_t HELPER(neon_narrow_sat_u8)(CPUState *env, uint64_t x)
{
    uint32_t packed = 0;
    int shift;

    for (shift = 0; shift < 64; shift += 16) {
        uint16_t lane = x >> shift;

        if (lane > 0xff) {
            lane = 0xff;
            SET_QC();
        }
        /* Lane at bit position shift lands at bit position shift / 2.  */
        packed |= (uint32_t)lane << (shift / 2);
    }
    return packed;
}
1135

    
1136
/* Narrow four signed 16-bit lanes to signed 8-bit values with saturation.
   Lanes that do not fit in int8_t become 0x7f (positive) or 0x80
   (negative) and SET_QC() is invoked.  */
uint32_t HELPER(neon_narrow_sat_s8)(CPUState *env, uint64_t x)
{
    uint32_t packed = 0;
    int shift;

    for (shift = 0; shift < 64; shift += 16) {
        int16_t lane = x >> shift;
        uint8_t out;

        if (lane == (int8_t)lane) {
            out = lane;
        } else {
            out = (lane < 0) ? 0x80 : 0x7f;
            SET_QC();
        }
        /* Lane at bit position shift lands at bit position shift / 2.  */
        packed |= (uint32_t)out << (shift / 2);
    }
    return packed;
}
1158

    
1159
/* Narrow two signed 32-bit lanes to unsigned 16-bit values with saturation.
   Negative lanes produce 0, lanes above 0xffff produce 0xffff; SET_QC()
   is invoked whenever a lane is saturated.  */
uint32_t HELPER(neon_unarrow_sat16)(CPUState *env, uint64_t x)
{
    uint32_t lanes[2];
    int i;

    lanes[0] = x;
    lanes[1] = x >> 32;
    for (i = 0; i < 2; i++) {
        if (lanes[i] & 0x80000000) {
            lanes[i] = 0;
            SET_QC();
        } else if (lanes[i] > 0xffff) {
            lanes[i] = 0xffff;
            SET_QC();
        }
    }
    return lanes[0] | (lanes[1] << 16);
}
1181

    
1182
/* Narrow two unsigned 32-bit lanes to unsigned 16-bit values, clamping
   anything above 0xffff to 0xffff and invoking SET_QC() when that happens.  */
uint32_t HELPER(neon_narrow_sat_u16)(CPUState *env, uint64_t x)
{
    uint32_t lanes[2];
    int i;

    lanes[0] = x;
    lanes[1] = x >> 32;
    for (i = 0; i < 2; i++) {
        if (lanes[i] > 0xffff) {
            lanes[i] = 0xffff;
            SET_QC();
        }
    }
    return lanes[0] | (lanes[1] << 16);
}
1198

    
1199
/* Narrow two signed 32-bit lanes to signed 16-bit values with saturation.
   Lanes that do not fit in int16_t become 0x7fff (positive) or 0x8000
   (negative) and SET_QC() is invoked.  The low lane is returned in bits
   0-15 and the high lane in bits 16-31.  */
uint32_t HELPER(neon_narrow_sat_s16)(CPUState *env, uint64_t x)
{
    int32_t low;
    int32_t high;

    low = x;
    if (low != (int16_t)low) {
        /* (low >> 31) is 0 or -1, giving 0x7fff or ...8000 after the XOR.  */
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    /* Shift as unsigned: left-shifting a negative signed value is
       undefined behaviour in C.  */
    return (uint16_t)low | ((uint32_t)high << 16);
}
1215

    
1216
/* Narrow a signed 64-bit value to an unsigned 32-bit value with
   saturation: negative inputs give 0, inputs above 0xffffffff give
   0xffffffff.  SET_QC() is invoked when the value is saturated.  */
uint32_t HELPER(neon_unarrow_sat32)(CPUState *env, uint64_t x)
{
    if ((int64_t)x < 0) {
        SET_QC();
        return 0;
    }
    if ((x >> 32) != 0) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}
1228

    
1229
/* Narrow an unsigned 64-bit value to 32 bits, clamping to 0xffffffff and
   invoking SET_QC() when the input does not fit.  */
uint32_t HELPER(neon_narrow_sat_u32)(CPUState *env, uint64_t x)
{
    if ((x >> 32) == 0) {
        return x;
    }
    SET_QC();
    return 0xffffffffu;
}
1237

    
1238
/* Narrow a signed 64-bit value to a signed 32-bit value with saturation.
   Out-of-range inputs give 0x7fffffff (positive) or 0x80000000 (negative)
   and SET_QC() is invoked.  */
uint32_t HELPER(neon_narrow_sat_s32)(CPUState *env, uint64_t x)
{
    int64_t wide = (int64_t)x;

    if (wide == (int32_t)x) {
        return x;
    }
    SET_QC();
    return (wide < 0) ? 0x80000000u : 0x7fffffffu;
}
1246

    
1247
uint64_t HELPER(neon_widen_u8)(uint32_t x)
1248
{
1249
    uint64_t tmp;
1250
    uint64_t ret;
1251
    ret = (uint8_t)x;
1252
    tmp = (uint8_t)(x >> 8);
1253
    ret |= tmp << 16;
1254
    tmp = (uint8_t)(x >> 16);
1255
    ret |= tmp << 32;
1256
    tmp = (uint8_t)(x >> 24);
1257
    ret |= tmp << 48;
1258
    return ret;
1259
}
1260

    
1261
uint64_t HELPER(neon_widen_s8)(uint32_t x)
1262
{
1263
    uint64_t tmp;
1264
    uint64_t ret;
1265
    ret = (uint16_t)(int8_t)x;
1266
    tmp = (uint16_t)(int8_t)(x >> 8);
1267
    ret |= tmp << 16;
1268
    tmp = (uint16_t)(int8_t)(x >> 16);
1269
    ret |= tmp << 32;
1270
    tmp = (uint16_t)(int8_t)(x >> 24);
1271
    ret |= tmp << 48;
1272
    return ret;
1273
}
1274

    
1275
uint64_t HELPER(neon_widen_u16)(uint32_t x)
1276
{
1277
    uint64_t high = (uint16_t)(x >> 16);
1278
    return ((uint16_t)x) | (high << 32);
1279
}
1280

    
1281
uint64_t HELPER(neon_widen_s16)(uint32_t x)
1282
{
1283
    uint64_t high = (int16_t)(x >> 16);
1284
    return ((uint32_t)(int16_t)x) | (high << 32);
1285
}
1286

    
1287
uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
1288
{
1289
    uint64_t mask;
1290
    mask = (a ^ b) & 0x8000800080008000ull;
1291
    a &= ~0x8000800080008000ull;
1292
    b &= ~0x8000800080008000ull;
1293
    return (a + b) ^ mask;
1294
}
1295

    
1296
uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
1297
{
1298
    uint64_t mask;
1299
    mask = (a ^ b) & 0x8000000080000000ull;
1300
    a &= ~0x8000000080000000ull;
1301
    b &= ~0x8000000080000000ull;
1302
    return (a + b) ^ mask;
1303
}
1304

    
1305
uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
1306
{
1307
    uint64_t tmp;
1308
    uint64_t tmp2;
1309

    
1310
    tmp = a & 0x0000ffff0000ffffull;
1311
    tmp += (a >> 16) & 0x0000ffff0000ffffull;
1312
    tmp2 = b & 0xffff0000ffff0000ull;
1313
    tmp2 += (b << 16) & 0xffff0000ffff0000ull;
1314
    return    ( tmp         & 0xffff)
1315
            | ((tmp  >> 16) & 0xffff0000ull)
1316
            | ((tmp2 << 16) & 0xffff00000000ull)
1317
            | ( tmp2        & 0xffff000000000000ull);
1318
}
1319

    
1320
uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
1321
{
1322
    uint32_t low = a + (a >> 32);
1323
    uint32_t high = b + (b >> 32);
1324
    return low + ((uint64_t)high << 32);
1325
}
1326

    
1327
uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
1328
{
1329
    uint64_t mask;
1330
    mask = (a ^ ~b) & 0x8000800080008000ull;
1331
    a |= 0x8000800080008000ull;
1332
    b &= ~0x8000800080008000ull;
1333
    return (a - b) ^ mask;
1334
}
1335

    
1336
uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
1337
{
1338
    uint64_t mask;
1339
    mask = (a ^ ~b) & 0x8000000080000000ull;
1340
    a |= 0x8000000080000000ull;
1341
    b &= ~0x8000000080000000ull;
1342
    return (a - b) ^ mask;
1343
}
1344

    
1345
/* Saturating signed add of two 32-bit lanes.  When a lane overflows, the
   result is clamped to 0x7fffffff or 0x80000000 according to the sign of
   the first operand, and SET_QC() is invoked.  */
uint64_t HELPER(neon_addl_saturate_s32)(CPUState *env, uint64_t a, uint64_t b)
{
    uint64_t result = 0;
    int shift;

    for (shift = 0; shift < 64; shift += 32) {
        uint32_t x = a >> shift;
        uint32_t y = b >> shift;
        uint32_t sum = x + y;

        /* Overflow: operands share a sign and the sum's sign differs.  */
        if (!((x ^ y) & SIGNBIT) && ((sum ^ x) & SIGNBIT)) {
            SET_QC();
            sum = ((int32_t)x < 0) ? SIGNBIT : ~SIGNBIT;
        }
        result |= (uint64_t)sum << shift;
    }
    return result;
}
1366

    
1367
/* Saturating signed 64-bit add.  On overflow the result is clamped to the
   largest or smallest int64_t value according to the sign of a, and
   SET_QC() is invoked.  */
uint64_t HELPER(neon_addl_saturate_s64)(CPUState *env, uint64_t a, uint64_t b)
{
    uint64_t sum = a + b;
    int same_sign = !((a ^ b) & SIGNBIT64);
    int sign_flipped = ((sum ^ a) & SIGNBIT64) != 0;

    if (same_sign && sign_flipped) {
        SET_QC();
        sum = ((int64_t)a < 0) ? SIGNBIT64 : ~SIGNBIT64;
    }
    return sum;
}
1378

    
1379
/* Absolute difference of x and y, each first converted to the element
   type "type"; the non-negative result is stored in dest.
   The operands are held in int64_t because the difference of two int32_t
   values (e.g. INT32_MAX - INT32_MIN) does not fit in 32 bits; int64_t
   covers every 8/16/32-bit signed or unsigned element type.  */
#define DO_ABD(dest, x, y, type) do { \
    int64_t tmp_x = (type)(x); \
    int64_t tmp_y = (type)(y); \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while(0)
1384

    
1385
uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
1386
{
1387
    uint64_t tmp;
1388
    uint64_t result;
1389
    DO_ABD(result, a, b, uint8_t);
1390
    DO_ABD(tmp, a >> 8, b >> 8, uint8_t);
1391
    result |= tmp << 16;
1392
    DO_ABD(tmp, a >> 16, b >> 16, uint8_t);
1393
    result |= tmp << 32;
1394
    DO_ABD(tmp, a >> 24, b >> 24, uint8_t);
1395
    result |= tmp << 48;
1396
    return result;
1397
}
1398

    
1399
uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
1400
{
1401
    uint64_t tmp;
1402
    uint64_t result;
1403
    DO_ABD(result, a, b, int8_t);
1404
    DO_ABD(tmp, a >> 8, b >> 8, int8_t);
1405
    result |= tmp << 16;
1406
    DO_ABD(tmp, a >> 16, b >> 16, int8_t);
1407
    result |= tmp << 32;
1408
    DO_ABD(tmp, a >> 24, b >> 24, int8_t);
1409
    result |= tmp << 48;
1410
    return result;
1411
}
1412

    
1413
uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
1414
{
1415
    uint64_t tmp;
1416
    uint64_t result;
1417
    DO_ABD(result, a, b, uint16_t);
1418
    DO_ABD(tmp, a >> 16, b >> 16, uint16_t);
1419
    return result | (tmp << 32);
1420
}
1421

    
1422
uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
1423
{
1424
    uint64_t tmp;
1425
    uint64_t result;
1426
    DO_ABD(result, a, b, int16_t);
1427
    DO_ABD(tmp, a >> 16, b >> 16, int16_t);
1428
    return result | (tmp << 32);
1429
}
1430

    
1431
uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
1432
{
1433
    uint64_t result;
1434
    DO_ABD(result, a, b, uint32_t);
1435
    return result;
1436
}
1437

    
1438
uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
1439
{
1440
    uint64_t result;
1441
    DO_ABD(result, a, b, int32_t);
1442
    return result;
1443
}
1444
#undef DO_ABD
1445

    
1446
/* Widening multiply.  Named type is the source type.
   x and y are first converted to the source element type type1, then to
   the result lane type type2, and multiplied modulo the width of type2.
   The product is formed in uint32_t: when type2 is uint16_t the operands
   would otherwise be promoted to int, and 0xffff * 0xffff overflows a
   signed 32-bit int.  */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((uint32_t)(type2)tmp_x * (uint32_t)(type2)tmp_y); \
    } while(0)
1452

    
1453
uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
1454
{
1455
    uint64_t tmp;
1456
    uint64_t result;
1457

    
1458
    DO_MULL(result, a, b, uint8_t, uint16_t);
1459
    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
1460
    result |= tmp << 16;
1461
    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
1462
    result |= tmp << 32;
1463
    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
1464
    result |= tmp << 48;
1465
    return result;
1466
}
1467

    
1468
uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
1469
{
1470
    uint64_t tmp;
1471
    uint64_t result;
1472

    
1473
    DO_MULL(result, a, b, int8_t, uint16_t);
1474
    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
1475
    result |= tmp << 16;
1476
    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
1477
    result |= tmp << 32;
1478
    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
1479
    result |= tmp << 48;
1480
    return result;
1481
}
1482

    
1483
uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
1484
{
1485
    uint64_t tmp;
1486
    uint64_t result;
1487

    
1488
    DO_MULL(result, a, b, uint16_t, uint32_t);
1489
    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
1490
    return result | (tmp << 32);
1491
}
1492

    
1493
uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
1494
{
1495
    uint64_t tmp;
1496
    uint64_t result;
1497

    
1498
    DO_MULL(result, a, b, int16_t, uint32_t);
1499
    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
1500
    return result | (tmp << 32);
1501
}
1502

    
1503
uint64_t HELPER(neon_negl_u16)(uint64_t x)
1504
{
1505
    uint16_t tmp;
1506
    uint64_t result;
1507
    result = (uint16_t)-x;
1508
    tmp = -(x >> 16);
1509
    result |= (uint64_t)tmp << 16;
1510
    tmp = -(x >> 32);
1511
    result |= (uint64_t)tmp << 32;
1512
    tmp = -(x >> 48);
1513
    result |= (uint64_t)tmp << 48;
1514
    return result;
1515
}
1516

    
1517
uint64_t HELPER(neon_negl_u32)(uint64_t x)
1518
{
1519
    uint32_t low = -x;
1520
    uint32_t high = -(x >> 32);
1521
    return low | ((uint64_t)high << 32);
1522
}
1523

    
1524
/* FIXME:  There should be a native op for this.  */
1525
uint64_t HELPER(neon_negl_u64)(uint64_t x)
1526
{
1527
    return -x;
1528
}
1529

    
1530
/* Saturating sign manipulation.  */
1531
/* ??? Make these use NEON_VOP1 */
1532
#define DO_QABS8(x) do { \
1533
    if (x == (int8_t)0x80) { \
1534
        x = 0x7f; \
1535
        SET_QC(); \
1536
    } else if (x < 0) { \
1537
        x = -x; \
1538
    }} while (0)
1539
uint32_t HELPER(neon_qabs_s8)(CPUState *env, uint32_t x)
1540
{
1541
    neon_s8 vec;
1542
    NEON_UNPACK(neon_s8, vec, x);
1543
    DO_QABS8(vec.v1);
1544
    DO_QABS8(vec.v2);
1545
    DO_QABS8(vec.v3);
1546
    DO_QABS8(vec.v4);
1547
    NEON_PACK(neon_s8, x, vec);
1548
    return x;
1549
}
1550
#undef DO_QABS8
1551

    
1552
#define DO_QNEG8(x) do { \
1553
    if (x == (int8_t)0x80) { \
1554
        x = 0x7f; \
1555
        SET_QC(); \
1556
    } else { \
1557
        x = -x; \
1558
    }} while (0)
1559
uint32_t HELPER(neon_qneg_s8)(CPUState *env, uint32_t x)
1560
{
1561
    neon_s8 vec;
1562
    NEON_UNPACK(neon_s8, vec, x);
1563
    DO_QNEG8(vec.v1);
1564
    DO_QNEG8(vec.v2);
1565
    DO_QNEG8(vec.v3);
1566
    DO_QNEG8(vec.v4);
1567
    NEON_PACK(neon_s8, x, vec);
1568
    return x;
1569
}
1570
#undef DO_QNEG8
1571

    
1572
#define DO_QABS16(x) do { \
1573
    if (x == (int16_t)0x8000) { \
1574
        x = 0x7fff; \
1575
        SET_QC(); \
1576
    } else if (x < 0) { \
1577
        x = -x; \
1578
    }} while (0)
1579
uint32_t HELPER(neon_qabs_s16)(CPUState *env, uint32_t x)
1580
{
1581
    neon_s16 vec;
1582
    NEON_UNPACK(neon_s16, vec, x);
1583
    DO_QABS16(vec.v1);
1584
    DO_QABS16(vec.v2);
1585
    NEON_PACK(neon_s16, x, vec);
1586
    return x;
1587
}
1588
#undef DO_QABS16
1589

    
1590
#define DO_QNEG16(x) do { \
1591
    if (x == (int16_t)0x8000) { \
1592
        x = 0x7fff; \
1593
        SET_QC(); \
1594
    } else { \
1595
        x = -x; \
1596
    }} while (0)
1597
uint32_t HELPER(neon_qneg_s16)(CPUState *env, uint32_t x)
1598
{
1599
    neon_s16 vec;
1600
    NEON_UNPACK(neon_s16, vec, x);
1601
    DO_QNEG16(vec.v1);
1602
    DO_QNEG16(vec.v2);
1603
    NEON_PACK(neon_s16, x, vec);
1604
    return x;
1605
}
1606
#undef DO_QNEG16
1607

    
1608
/* Saturating absolute value of a signed 32-bit value: 0x80000000 becomes
   0x7fffffff and SET_QC() is invoked; other negative values are negated.  */
uint32_t HELPER(neon_qabs_s32)(CPUState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        return ~SIGNBIT;
    }
    return ((int32_t)x < 0) ? -x : x;
}
1618

    
1619
/* Saturating negation of a signed 32-bit value: 0x80000000 becomes
   0x7fffffff and SET_QC() is invoked; every other value is negated.  */
uint32_t HELPER(neon_qneg_s32)(CPUState *env, uint32_t x)
{
    if (x != SIGNBIT) {
        return -x;
    }
    SET_QC();
    return ~SIGNBIT;
}
1629

    
1630
/* NEON Float helpers.  */
1631
uint32_t HELPER(neon_min_f32)(uint32_t a, uint32_t b)
1632
{
1633
    float32 f0 = vfp_itos(a);
1634
    float32 f1 = vfp_itos(b);
1635
    return (float32_compare_quiet(f0, f1, NFS) == -1) ? a : b;
1636
}
1637

    
1638
uint32_t HELPER(neon_max_f32)(uint32_t a, uint32_t b)
1639
{
1640
    float32 f0 = vfp_itos(a);
1641
    float32 f1 = vfp_itos(b);
1642
    return (float32_compare_quiet(f0, f1, NFS) == 1) ? a : b;
1643
}
1644

    
1645
uint32_t HELPER(neon_abd_f32)(uint32_t a, uint32_t b)
1646
{
1647
    float32 f0 = vfp_itos(a);
1648
    float32 f1 = vfp_itos(b);
1649
    return vfp_stoi((float32_compare_quiet(f0, f1, NFS) == 1)
1650
                    ? float32_sub(f0, f1, NFS)
1651
                    : float32_sub(f1, f0, NFS));
1652
}
1653

    
1654
uint32_t HELPER(neon_add_f32)(uint32_t a, uint32_t b)
1655
{
1656
    return vfp_stoi(float32_add(vfp_itos(a), vfp_itos(b), NFS));
1657
}
1658

    
1659
uint32_t HELPER(neon_sub_f32)(uint32_t a, uint32_t b)
1660
{
1661
    return vfp_stoi(float32_sub(vfp_itos(a), vfp_itos(b), NFS));
1662
}
1663

    
1664
uint32_t HELPER(neon_mul_f32)(uint32_t a, uint32_t b)
1665
{
1666
    return vfp_stoi(float32_mul(vfp_itos(a), vfp_itos(b), NFS));
1667
}
1668

    
1669
/* Floating point comparisons produce an integer result.  */
1670
#define NEON_VOP_FCMP(name, cmp) \
1671
uint32_t HELPER(neon_##name)(uint32_t a, uint32_t b) \
1672
{ \
1673
    if (float32_compare_quiet(vfp_itos(a), vfp_itos(b), NFS) cmp 0) \
1674
        return ~0; \
1675
    else \
1676
        return 0; \
1677
}
1678

    
1679
NEON_VOP_FCMP(ceq_f32, ==)
1680
NEON_VOP_FCMP(cge_f32, >=)
1681
NEON_VOP_FCMP(cgt_f32, >)
1682

    
1683
uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b)
1684
{
1685
    float32 f0 = float32_abs(vfp_itos(a));
1686
    float32 f1 = float32_abs(vfp_itos(b));
1687
    return (float32_compare_quiet(f0, f1,NFS) >= 0) ? ~0 : 0;
1688
}
1689

    
1690
uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b)
1691
{
1692
    float32 f0 = float32_abs(vfp_itos(a));
1693
    float32 f1 = float32_abs(vfp_itos(b));
1694
    return (float32_compare_quiet(f0, f1, NFS) > 0) ? ~0 : 0;
1695
}